1 import threading
2 import re
3 import urlparse
4 import copy
5 from lxml import etree
6 from lxml.html import defs
7 from lxml import cssselect
8 from lxml.html.setmixin import SetMixin
9 try:
10 from UserDict import DictMixin
11 except ImportError:
12
13 from lxml.html._dictmixin import DictMixin
14 import sets
15
# Public names exported by a star-import of this module.  Each name must
# appear exactly once.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links']

# Precompiled XPath expressions and regular expressions shared by the
# element helpers below; compiling once at import time avoids re-parsing
# the expression on every call.

# Every <a> element (the context node included) that has a ``rel`` attribute.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")

# Elements whose whitespace-separated ``class`` list contains $class_name.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# ``url(...)`` references in CSS; group 1 is whatever sits between the
# parentheses (matched case-insensitively).
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
# Double-quoted ``@import "..."`` references in CSS.
_css_import_re = re.compile(r'@import "(.*?)"')
# <label> elements anywhere in the document whose ``for`` equals $id.
_label_xpath = etree.XPath("//label[@for=$id]")
30
32
34 """
35 Returns the base URL, given when the page was parsed.
36
37 Use with ``urlparse.urljoin(el.base_url, href)`` to get
38 absolute URLs.
39 """
40 return self.getroottree().docinfo.URL
41 base_url = property(base_url, doc=base_url.__doc__)
42
48 forms = property(forms, doc=forms.__doc__)
49
51 """
52 Return the <body> element. Can be called from a child element
53 to get the document's body.
54 """
55 return self.xpath('//body')[0]
56 body = property(body, doc=body.__doc__)
57
59 """
60 Returns the <head> element. Can be called from a child
61 element to get the document's head.
62 """
63 return self.xpath('//head')[0]
64 head = property(head, doc=head.__doc__)
65
67 """
68 Get or set any <label> element associated with this element.
69 """
70 id = self.get('id')
71 if not id:
72 return None
73 result = _label_xpath(self, id=id)
74 if not result:
75 return None
76 else:
77 return result[0]
79 id = self.get('id')
80 if not id:
81 raise TypeError(
82 "You cannot set a label for an element (%r) that has no id"
83 % self)
84 if not label.tag == 'label':
85 raise TypeError(
86 "You can only assign label to a label element (not %r)"
87 % label)
88 label.set('for', id)
93 label = property(label__get, label__set, label__del, doc=label__get.__doc__)
94
96 """
97 Removes this element from the tree, including its children and
98 text. The tail text is joined to the previous element or
99 parent.
100 """
101 parent = self.getparent()
102 assert parent is not None
103 if self.tail:
104 previous = self.getprevious()
105 if previous is None:
106 parent.text = (parent.text or '') + self.tail
107 else:
108 previous.tail = (previous.tail or '') + self.tail
109 parent.remove(self)
110
112 """
113 Remove the tag, but not its children or text. The children and text
114 are merged into the parent.
115
116 Example::
117
118 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
119 >>> h.find('//b').drop_tag()
120 >>> print tostring(h)
121 <div>Hello World!</div>
122 """
123 parent = self.getparent()
124 assert parent is not None
125 previous = self.getprevious()
126 if self.text and isinstance(self.tag, basestring):
127
128 if previous is None:
129 parent.text = (parent.text or '') + self.text
130 else:
131 previous.tail = (previous.tail or '') + self.text
132 if self.tail:
133 if len(self):
134 last = self[-1]
135 last.tail = (last.tail or '') + self.tail
136 elif previous is None:
137 parent.text = (parent.text or '') + self.tail
138 else:
139 previous.tail = (previous.tail or '') + self.tail
140 index = parent.index(self)
141 parent[index:index+1] = self[:]
142
144 """
145 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
146 """
147 rel = rel.lower()
148 return [el for el in _rel_links_xpath(self)
149 if el.get('rel').lower() == rel]
150
152 """
153 Find any elements with the given class name.
154 """
155 return _class_xpath(self, class_name=class_name)
156
158 """
159 Get the first element in a document with the given id. If none is
160 found, return the default argument if provided or raise KeyError
161 otherwise.
162
163 Note that there can be more than one element with the same id,
164 and this isn't uncommon in HTML documents found in the wild.
165 Browsers return only the first match, and this function does
166 the same.
167 """
168 try:
169
170
171 return _id_xpath(self, id=id)[0]
172 except IndexError:
173 if default:
174 return default[0]
175 else:
176 raise KeyError, id
177
def text_content(self):
    """
    Return the text content of the tag (and the text in any children).

    The value is obtained by evaluating the module-level, precompiled
    XPath ``string()`` expression (``_collect_string_content``) with this
    element as the context node.
    """
    return _collect_string_content(self)
183
185 """
186 Run the CSS expression on this element and its children,
187 returning a list of the results.
188
189 Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
190 that pre-compiling the expression can provide a substantial
191 speedup.
192 """
193 return cssselect.CSSSelect(expr)(self)
194
195
196
197
198
200 """
201 Make all links in the document absolute, given the
202 ``base_url`` for the document (the full URL where the document
203 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
204
205 If ``resolve_base_href`` is true, then any ``<base href>``
206 tags in the document are used *and* removed from the document.
207 If it is false then any such tag is ignored.
208 """
209 if base_url is None:
210 base_url = self.base_url
211 if base_url is None:
212 raise TypeError(
213 "No base_url given, and the document has no base_url")
214 if resolve_base_href:
215 self.resolve_base_href()
216 def link_repl(href):
217 return urlparse.urljoin(base_url, href)
218 self.rewrite_links(link_repl)
219
221 """
222 Find any ``<base href>`` tag in the document, and apply its
223 values to all links found in the document. Also remove the
224 tag once it has been applied.
225 """
226 base_href = None
227 basetags = self.xpath('//base[@href]')
228 for b in basetags:
229 base_href = b.get('href')
230 b.drop_tree()
231 if not base_href:
232 return
233 self.make_links_absolute(base_href, resolve_base_href=False)
234
236 """
237 Yield (element, attribute, link, pos), where attribute may be None
238 (indicating the link is in the text). ``pos`` is the position
239 where the link occurs; often 0, but sometimes something else in
240 the case of links in stylesheets or style tags.
241
242 Note: <base href> is *not* taken into account in any way. The
243 link you get is exactly the link in the document.
244 """
245 link_attrs = defs.link_attrs
246 for el in self.getiterator():
247 attribs = el.attrib
248 for attrib in link_attrs:
249 if attrib in attribs:
250 yield (el, attrib, attribs[attrib], 0)
251 if el.tag == 'style' and el.text:
252 for match in _css_url_re.finditer(el.text):
253 yield (el, None, match.group(1), match.start(1))
254 for match in _css_import_re.finditer(el.text):
255 yield (el, None, match.group(1), match.start(1))
256 if 'style' in attribs:
257 for match in _css_url_re.finditer(attribs['style']):
258 yield (el, 'style', match.group(1), match.start(1))
259
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.

    ``link_repl_func`` is called once per link, in the order
    ``iterlinks()`` yields them.  The tree is only modified after every
    link has been seen, so several links inside the same style text or
    ``style`` attribute are each replaced at the right place, and a
    removal is applied once per attribute/text.
    """
    if base_href is not None:
        # Done as a separate pass: make everything absolute first, then
        # hand the resulting links to ``link_repl_func`` below.
        self.make_links_absolute(base_href,
                                 resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    # Snapshot the links before changing anything: ``iterlinks()`` is a
    # generator, and the positions it reports refer to the unmodified
    # text.  Holding the list also keeps the element objects alive, so
    # ``id(el)`` is a stable key for the duration of this call.
    links = list(self.iterlinks())

    targets = []    # [element, attrib, edits] in first-seen order;
                    # edits is None once the target is marked for removal
    by_key = {}
    for el, attrib, link, pos in links:
        new_link = link_repl_func(link)
        if new_link == link:
            continue
        key = (id(el), attrib)
        target = by_key.get(key)
        if target is None:
            target = [el, attrib, []]
            by_key[key] = target
            targets.append(target)
        if new_link is None:
            # Removing the attribute/text overrides any other edit to it.
            target[2] = None
        elif target[2] is not None:
            target[2].append((pos, link, new_link))

    for el, attrib, edits in targets:
        if edits is None:
            # Remove the attribute or the element's text content.
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            continue
        if attrib is None:
            current = el.text
        else:
            current = el.attrib[attrib]
        # Apply right-to-left so that earlier offsets stay valid while
        # the string changes length.
        edits.sort()
        edits.reverse()
        for pos, old_link, new_link in edits:
            current = current[:pos] + new_link + current[pos+len(old_link):]
        if attrib is None:
            el.text = current
        else:
            el.attrib[attrib] = current
306
307
309 """
310 An object that represents a method on an element as a function;
311 the function takes either an element or an HTML string. It
312 returns whatever the function normally returns, or if the function
313 works in-place (and so returns None) it returns a serialized form
314 of the resulting document.
315 """
321 if isinstance(doc, basestring):
322 if 'copy' in kw:
323 raise TypeError(
324 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
325 return_string = True
326 doc = fromstring(doc, **kw)
327 else:
328 if 'copy' in kw:
329 copy = kw.pop('copy')
330 else:
331 copy = self.copy
332 return_string = False
333 if copy:
334 doc = copy.deepcopy(doc)
335 meth = getattr(doc, self.name)
336 result = meth(*args, **kw)
337
338 if result is None:
339
340 if return_string:
341 return tostring(doc)
342 else:
343 return doc
344 else:
345 return result
346
347 find_rel_links = _MethodFunc('find_rel_links', copy=False)
348 find_class = _MethodFunc('find_class', copy=False)
349 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
350 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
351 iterlinks = _MethodFunc('iterlinks', copy=False)
352 rewrite_links = _MethodFunc('rewrite_links', copy=True)
353
356
359
362
365
366
368 """A lookup scheme for HTML Element classes.
369
370 To create a lookup instance with different Element classes, pass a tag
371 name mapping of Element classes in the ``classes`` keyword argument and/or
372 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
373 The special key '*' denotes a Mixin class that should be mixed into all
374 Element classes.
375 """
376 _default_element_classes = {}
377
378 - def __init__(self, classes=None, mixins=None):
395
396 - def lookup(self, node_type, document, namespace, name):
407
408
409 html_parser = etree.HTMLParser()
410
417
419 """
420 Parses several HTML elements, returning a list of elements.
421
422 The first item in the list may be a string (though leading
423 whitespace is removed). If no_leading_text is true, then it will
424 be an error if there is leading text, and it will always be a list
425 of only elements.
426 """
427
428 start = html[:20].lstrip().lower()
429 if not start.startswith('<html') and not start.startswith('<!doctype'):
430 html = '<html><body>%s</body></html>' % html
431 doc = document_fromstring(html, **kw)
432 assert doc.tag == 'html'
433 bodies = [e for e in doc if e.tag == 'body']
434 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
435 body = bodies[0]
436 elements = []
437 if no_leading_text and body.text and body.text.strip():
438 raise etree.ParserError(
439 "There is leading text: %r" % body.text)
440 if body.text and body.text.strip():
441 elements.append(body.text)
442 elements.extend(body)
443
444
445 return elements
446
448 """
449 Parses a single HTML element; it is an error if there is more than
450 one element, or if anything but whitespace precedes or follows the
451 element.
452
453 If create_parent is true (or is a tag name) then a parent node
454 will be created to encapsulate the HTML in a single element.
455 """
456 if create_parent:
457 if not isinstance(create_parent, basestring):
458 create_parent = 'div'
459 return fragment_fromstring('<%s>%s</%s>' % (
460 create_parent, html, create_parent), **kw)
461 elements = fragments_fromstring(html, no_leading_text=True)
462 if not elements:
463 raise etree.ParserError(
464 "No elements found")
465 if len(elements) > 1:
466 raise etree.ParserError(
467 "Multiple elements found (%s)"
468 % ', '.join([_element_name(e) for e in elements]))
469 el = elements[0]
470 if el.tail and el.tail.strip():
471 raise etree.ParserError(
472 "Element followed by text: %r" % el.tail)
473 el.tail = None
474 return el
475
530
def parse(filename, parser=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document.

    ``parser`` is the parser object handed to ``etree.parse()``; when it
    is omitted the module-level ``html_parser`` is used.

    You may pass the keyword argument ``base_url='http://...'`` to set
    the base URL.  Any other keyword arguments are forwarded to
    ``etree.parse()`` unchanged, and its return value is returned as is.
    """
    if parser is None:
        parser = html_parser
    return etree.parse(filename, parser, **kw)
539
547
549 if isinstance(el, etree.CommentBase):
550 return 'comment'
551 elif isinstance(el, basestring):
552 return 'string'
553 else:
554 return el.tag
555
559
660
661 HtmlElementClassLookup._default_element_classes['form'] = FormElement
662
695
697 import urllib
698
699 if method == 'GET':
700 if '?' in url:
701 url += '&'
702 else:
703 url += '?'
704 url += urllib.urlencode(values)
705 data = None
706 else:
707 data = urllib.urlencode(values)
708 return urllib.urlopen(url, data)
709
711
719 raise KeyError(
720 "You cannot remove keys from ElementDict")
724 return item in self.inputs
725
727 return '<%s for form %s>' % (
728 self.__class__.__name__,
729 self.inputs.form._name())
730
794
822
823 -class TextareaElement(InputMixin, HtmlElement):
824 """
825 ``<textarea>`` element. You can get the name with ``.name`` and
826 get/set the value with ``.value``
827 """
828
829 - def value__get(self):
830 """
831 Get/set the value (which is the contents of this element)
832 """
833 return self.text or ''
834 - def value__set(self, value):
836 - def value__del(self):
838 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
839
840 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
841
843 """
844 ``<select>`` element. You can get the name with ``.name``.
845
846 ``.value`` will be the value of the selected option, unless this
847 is a multi-select element (``<select multiple>``), in which case
848 it will be a set-like object. In either case ``.value_options``
849 gives the possible values.
850
851 The boolean attribute ``.multiple`` shows if this is a
852 multi-select.
853 """
854
856 """
857 Get/set the value of this select (the selected option).
858
859 If this is a multi-select, this is a set-like object that
860 represents all the selected options.
861 """
862 if self.multiple:
863 return MultipleSelectOptions(self)
864 for el in self.getiterator('option'):
865 if 'selected' in el.attrib:
866 value = el.get('value')
867
868 return value
869 return None
870
872 if self.multiple:
873 if isinstance(value, basestring):
874 raise TypeError(
875 "You must pass in a sequence")
876 self.value.clear()
877 self.value.update(value)
878 return
879 if value is not None:
880 for el in self.getiterator('option'):
881
882 if el.get('value') == value:
883 checked_option = el
884 break
885 else:
886 raise ValueError(
887 "There is no option with the value of %r" % value)
888 for el in self.getiterator('option'):
889 if 'selected' in el.attrib:
890 del el.attrib['selected']
891 if value is not None:
892 checked_option.set('selected', '')
893
900
901 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
902
904 """
905 All the possible values this select can have (the ``value``
906 attribute of all the ``<option>`` elements).
907 """
908 return [el.get('value') for el in self.getiterator('option')]
909 value_options = property(value_options, doc=value_options.__doc__)
910
912 """
913 Boolean attribute: is there a ``multiple`` attribute on this element.
914 """
915 return 'multiple' in self.attrib
917 if value:
918 self.set('multiple', '')
919 elif 'multiple' in self.attrib:
920 del self.attrib['multiple']
921 multiple = property(multiple__get, multiple__set, doc=multiple__get.__doc__)
922
923 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
924
926 """
927 Represents all the selected options in a ``<select multiple>`` element.
928
929 You can add to this set-like object to select an option, or remove
930 to unselect the option.
931 """
932
935
937 """
938 Iterator of all the ``<option>`` elements.
939 """
940 return self.select.getiterator('option')
941 options = property(options)
942
944 for option in self.options:
945 yield option.get('value')
946
def add(self, item):
    """
    Select the option whose ``value`` attribute equals ``item``.

    Walks ``self.options`` and sets an empty ``selected`` attribute on
    the first option that matches; later options with the same value are
    left alone.

    Raises ValueError if no option has that value.
    """
    for option in self.options:
        if option.get('value') == item:
            option.set('selected', '')
            return
    raise ValueError(
        "There is no option with the value %r" % item)
955
957 for option in self.options:
958 if option.get('value') == item:
959 if 'selected' in option.attrib:
960 del option.attrib['selected']
961 else:
962 raise ValueError(
963 "The option %r is not currently selected" % item)
964 break
965 else:
966 raise ValueError(
967 "There is not option with the value %r" % item)
968
970 return '<%s {%s} for select name=%r>' % (
971 self.__class__.__name__,
972 ', '.join([repr(v) for v in self]),
973 self.select.name)
974
976 """
977 This object represents several ``<input type=radio>`` elements
978 that have the same name.
979
980 You can use this like a list, but also use the property
981 ``.value`` to check/uncheck inputs. Also you can use
982 ``.value_options`` to get the possible values.
983 """
984
986 """
987 Get/set the value, which checks the radio with that value (and
988 unchecks any other value).
989 """
990 for el in self:
991 if 'checked' in el.attrib:
992 return el.get('value')
993 return None
994
996 if value is not None:
997 for el in self:
998 if el.get('value') == value:
999 checked_option = el
1000 break
1001 else:
1002 raise ValueError(
1003 "There is no radio input with the value %r" % value)
1004 for el in self:
1005 if 'checked' in el.attrib:
1006 del el.attrib['checked']
1007 if value is not None:
1008 checked_option.set('checked', '')
1009
1012
1013 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
1014
1016 """
1017 Returns a list of all the possible values.
1018 """
1019 return [el.get('value') for el in self]
1020 value_options = property(value_options, doc=value_options.__doc__)
1021
1023 return '%s(%s)' % (
1024 self.__class__.__name__,
1025 list.__repr__(self))
1026
1028 """
1029 Represents a group of checkboxes (``<input type=checkbox>``) that
1030 have the same name.
1031
1032 In addition to using this like a list, the ``.value`` attribute
1033 returns a set-like object that you can add to or remove from to
1034 check and uncheck checkboxes. You can also use ``.value_options``
1035 to get the possible values.
1036 """
1037
1039 """
1040 Return a set-like object that can be modified to check or
1041 uncheck individual checkboxes according to their value.
1042 """
1043 return CheckboxValues(self)
1053 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
1054
1056 return '%s(%s)' % (
1057 self.__class__.__name__, list.__repr__(self))
1058
1060
1061 """
1062 Represents the values of the checked checkboxes in a group of
1063 checkboxes with the same name.
1064 """
1065
1068
1070 return iter([
1071 el.get('value')
1072 for el in self.group
1073 if 'checked' in el.attrib])
1074
def add(self, value):
    """
    Check the checkbox whose ``value`` attribute equals ``value``.

    Walks ``self.group`` and sets an empty ``checked`` attribute on the
    first element that matches; later elements with the same value are
    left alone.

    Raises KeyError if no element in the group has that value.
    """
    for el in self.group:
        if el.get('value') == value:
            el.set('checked', '')
            return
    raise KeyError("No checkbox with value %r" % value)
1082
1084 for el in self.group:
1085 if el.get('value') == value:
1086 if 'checked' in el.attrib:
1087 del el.attrib['checked']
1088 else:
1089 raise KeyError(
1090 "The checkbox with value %r was already unchecked" % value)
1091 break
1092 else:
1093 raise KeyError(
1094 "No checkbox with value %r" % value)
1095
1097 return '<%s {%s} for checkboxes name=%r>' % (
1098 self.__class__.__name__,
1099 ', '.join([repr(v) for v in self]),
1100 self.group.name)
1101
1185
1186 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1187
1189 """
1190 Represents a ``<label>`` element.
1191
1192 Label elements are linked to other elements with their ``for``
1193 attribute. You can access this element with ``label.for_element``.
1194 """
1195
1197 """
1198 Get/set the element this label points to. Return None if it
1199 can't be found.
1200 """
1201 id = self.get('for')
1202 if not id:
1203 return None
1204 return self.body.get_element_by_id(id)
1206 id = other.get('id')
1207 if not id:
1208 raise TypeError(
1209 "Element %r has no id attribute" % other)
1210 self.set('for', id)
1214 for_element = property(for_element__get, for_element__set, for_element__del,
1215 doc=for_element__get.__doc__)
1216
1217 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1218
1219
1220
1221
1222
1223
1224
1225 __replace_meta_content_type = re.compile(
1226 r'<meta http-equiv="Content-Type".*?>').sub
1227
def tostring(doc, pretty_print=False, include_meta_content_type=False):
    """
    Return the HTML string representation of the document given.

    ``doc`` is serialized with ``etree.tostring(..., method="html")``;
    ``pretty_print`` is passed through to that call.

    Unless ``include_meta_content_type`` is true, every
    ``<meta http-equiv="Content-Type" ...>`` tag is removed from the
    serialized string before it is returned.  This also drops such a tag
    if it was present in the document itself.
    (Presumably the filter exists because the HTML serializer may add
    that tag on its own -- confirm against the serializer in use.)
    """
    # NOTE: this guard is an ``assert`` and therefore disappears under -O.
    assert doc is not None
    html = etree.tostring(doc, method="html", pretty_print=pretty_print)
    if include_meta_content_type:
        return html
    return __replace_meta_content_type('', html)
1240
1242 """
1243 Open the HTML document in a web browser (saving it to a temporary
1244 file to open it).
1245 """
1246 import os
1247 import webbrowser
1248 try:
1249 write_doc = doc.write
1250 except AttributeError:
1251 write_doc = etree.ElementTree(element=doc).write
1252 fn = os.tempnam() + '.html'
1253 write_doc(fn, method="html")
1254 url = 'file://' + fn.replace(os.path.sep, '/')
1255 print url
1256 webbrowser.open(url)
1257
1258
1259
1260
1261 html_parser.setElementClassLookup(HtmlElementClassLookup())
1262