1 import threading
2 import re
3 import urlparse
4 import copy
5 from lxml import etree
6 from lxml.html import defs
7 from lxml import cssselect
8 from lxml.html.setmixin import SetMixin
9 try:
10 from UserDict import DictMixin
11 except ImportError:
12
13 from lxml.html._dictmixin import DictMixin
14 import sets
15
# Names exported by ``from <module> import *``.  Each name is listed
# exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links']
21
# Pre-compiled XPath expressions and regular expressions used by the
# helpers in this module.

# <a> elements carrying a rel attribute, the context node included.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")

# Elements whose whitespace-separated class attribute contains the word
# bound to $class_name.
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains("
    "concat(' ', normalize-space(@class), ' '), "
    "concat(' ', $class_name, ' '))]")

# Elements whose id attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")

# String value of the context node.
_collect_string_content = etree.XPath("string()")

# ``url(...)`` references in CSS; group 1 is the text between the
# parentheses.  Matching is case-insensitive.
_css_url_re = re.compile(r'url\((.*?)\)', re.I)

# ``@import "..."`` statements in CSS; group 1 is the quoted text.
_css_import_re = re.compile(r'@import "(.*?)"')

# <label> elements anywhere in the document whose ``for`` equals $id.
_label_xpath = etree.XPath("//label[@for=$id]")

# One run of non-space characters (used to split space-separated
# ``archive`` attribute values).
_archive_re = re.compile(r'[^ ]+')
31
33
35 """
36 Returns the base URL, given when the page was parsed.
37
38 Use with ``urlparse.urljoin(el.base_url, href)`` to get
39 absolute URLs.
40 """
41 return self.getroottree().docinfo.URL
42 base_url = property(base_url, doc=base_url.__doc__)
43
49 forms = property(forms, doc=forms.__doc__)
50
52 """
53 Return the <body> element. Can be called from a child element
54 to get the document's body.
55 """
56 return self.xpath('//body')[0]
57 body = property(body, doc=body.__doc__)
58
60 """
61 Returns the <head> element. Can be called from a child
62 element to get the document's head.
63 """
64 return self.xpath('//head')[0]
65 head = property(head, doc=head.__doc__)
66
68 """
69 Get or set any <label> element associated with this element.
70 """
71 id = self.get('id')
72 if not id:
73 return None
74 result = _label_xpath(self, id=id)
75 if not result:
76 return None
77 else:
78 return result[0]
80 id = self.get('id')
81 if not id:
82 raise TypeError(
83 "You cannot set a label for an element (%r) that has no id"
84 % self)
85 if not label.tag == 'label':
86 raise TypeError(
87 "You can only assign label to a label element (not %r)"
88 % label)
89 label.set('for', id)
94 label = property(label__get, label__set, label__del, doc=label__get.__doc__)
95
97 """
98 Removes this element from the tree, including its children and
99 text. The tail text is joined to the previous element or
100 parent.
101 """
102 parent = self.getparent()
103 assert parent is not None
104 if self.tail:
105 previous = self.getprevious()
106 if previous is None:
107 parent.text = (parent.text or '') + self.tail
108 else:
109 previous.tail = (previous.tail or '') + self.tail
110 parent.remove(self)
111
113 """
114 Remove the tag, but not its children or text. The children and text
115 are merged into the parent.
116
117 Example::
118
119 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
120 >>> h.find('//b').drop_tag()
121 >>> print tostring(h)
122 <div>Hello World!</div>
123 """
124 parent = self.getparent()
125 assert parent is not None
126 previous = self.getprevious()
127 if self.text and isinstance(self.tag, basestring):
128
129 if previous is None:
130 parent.text = (parent.text or '') + self.text
131 else:
132 previous.tail = (previous.tail or '') + self.text
133 if self.tail:
134 if len(self):
135 last = self[-1]
136 last.tail = (last.tail or '') + self.tail
137 elif previous is None:
138 parent.text = (parent.text or '') + self.tail
139 else:
140 previous.tail = (previous.tail or '') + self.tail
141 index = parent.index(self)
142 parent[index:index+1] = self[:]
143
145 """
146 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
147 """
148 rel = rel.lower()
149 return [el for el in _rel_links_xpath(self)
150 if el.get('rel').lower() == rel]
151
153 """
154 Find any elements with the given class name.
155 """
156 return _class_xpath(self, class_name=class_name)
157
159 """
160 Get the first element in a document with the given id. If none is
161 found, return the default argument if provided or raise KeyError
162 otherwise.
163
164 Note that there can be more than one element with the same id,
165 and this isn't uncommon in HTML documents found in the wild.
166 Browsers return only the first match, and this function does
167 the same.
168 """
169 try:
170
171
172 return _id_xpath(self, id=id)[0]
173 except IndexError:
174 if default:
175 return default[0]
176 else:
177 raise KeyError, id
178
def text_content(self):
    """
    Return the string value of this element: its own text together
    with the text of everything nested inside it.
    """
    content = _collect_string_content(self)
    return content
184
186 """
187 Run the CSS expression on this element and its children,
188 returning a list of the results.
189
190 Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
191 that pre-compiling the expression can provide a substantial
192 speedup.
193 """
194 return cssselect.CSSSelect(expr)(self)
195
196
197
198
199
201 """
202 Make all links in the document absolute, given the
203 ``base_url`` for the document (the full URL where the document
204 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
205
206 If ``resolve_base_href`` is true, then any ``<base href>``
207 tags in the document are used *and* removed from the document.
208 If it is false then any such tag is ignored.
209 """
210 if base_url is None:
211 base_url = self.base_url
212 if base_url is None:
213 raise TypeError(
214 "No base_url given, and the document has no base_url")
215 if resolve_base_href:
216 self.resolve_base_href()
217 def link_repl(href):
218 return urlparse.urljoin(base_url, href)
219 self.rewrite_links(link_repl)
220
222 """
223 Find any ``<base href>`` tag in the document, and apply its
224 values to all links found in the document. Also remove the
225 tag once it has been applied.
226 """
227 base_href = None
228 basetags = self.xpath('//base[@href]')
229 for b in basetags:
230 base_href = b.get('href')
231 b.drop_tree()
232 if not base_href:
233 return
234 self.make_links_absolute(base_href, resolve_base_href=False)
235
237 """
238 Yield (element, attribute, link, pos), where attribute may be None
239 (indicating the link is in the text). ``pos`` is the position
240 where the link occurs; often 0, but sometimes something else in
241 the case of links in stylesheets or style tags.
242
243 Note: <base href> is *not* taken into account in any way. The
244 link you get is exactly the link in the document.
245 """
246 link_attrs = defs.link_attrs
247 for el in self.getiterator():
248 attribs = el.attrib
249 if el.tag != 'object':
250 for attrib in link_attrs:
251 if attrib in attribs:
252 yield (el, attrib, attribs[attrib], 0)
253 elif el.tag == 'object':
254 codebase = None
255
256
257 if 'codebase' in attribs:
258 codebase = el.get('codebase')
259 yield (el, 'codebase', codebase, 0)
260 for attrib in 'classid', 'data':
261 if attrib in attribs:
262 value = el.get(attrib)
263 if codebase is not None:
264 value = urlparse.urljoin(codebase, value)
265 yield (el, attrib, value, 0)
266 if 'archive' in attribs:
267 for match in _archive_re.finditer(el.get('archive')):
268 value = match.group(0)
269 if codebase is not None:
270 value = urlparse.urljoin(codebase, value)
271 yield (el, 'archive', value, match.start())
272 if el.tag == 'param':
273 valuetype = el.get('valuetype') or ''
274 if valuetype.lower() == 'ref':
275
276
277
278
279
280
281 yield (el, 'value', el.get('value'), 0)
282 if el.tag == 'style' and el.text:
283 for match in _css_url_re.finditer(el.text):
284 yield (el, None, match.group(1), match.start(1))
285 for match in _css_import_re.finditer(el.text):
286 yield (el, None, match.group(1), match.start(1))
287 if 'style' in attribs:
288 for match in _css_url_re.finditer(attribs['style']):
289 yield (el, 'style', match.group(1), match.start(1))
290
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account (the document
    is first run through ``make_links_absolute(base_href, ...)``).
    Otherwise, if ``resolve_base_href`` is true,
    ``resolve_base_href()`` is applied before rewriting.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Any further links reported
    for that same attribute/text are still passed to
    ``link_repl_func`` but are otherwise ignored.

    Several links may live in one attribute value or one text node
    (e.g. multiple ``url(...)`` in a stylesheet).  The positions
    reported by ``iterlinks()`` refer to the value as it was before
    any rewriting, so the length change of every earlier replacement
    is accounted for when a later link is spliced in.

    Returns None; the document is modified in place.
    """
    if base_href is not None:
        self.make_links_absolute(base_href,
                                 resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    # Bookkeeping is kept per element; all links of one element are
    # reported consecutively, so it is reset whenever the element changes.
    current_el = None
    edits = {}     # attrib (None for text) -> [(original pos, length delta)]
    removed = []   # attribs (None for text) already removed on current_el
    for el, attrib, link, pos in self.iterlinks():
        if el is not current_el:
            current_el = el
            edits = {}
            removed = []
        new_link = link_repl_func(link)
        if new_link == link:
            continue
        if attrib in removed:
            # The whole attribute/text is already gone; nothing to splice
            # into (and ``el.attrib[attrib]`` would raise KeyError).
            continue
        if new_link is None:
            # Remove the attribute or the element text.
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            removed.append(attrib)
            continue

        if attrib is None:
            old_value = el.text
        else:
            old_value = el.attrib[attrib]

        # Translate the original position into a position in the value
        # as it looks now, after earlier replacements in the same slot.
        start = pos
        for edit_pos, delta in edits.get(attrib, []):
            if edit_pos < pos:
                start += delta

        new_value = (old_value[:start] + new_link
                     + old_value[start + len(link):])
        if attrib is None:
            el.text = new_value
        else:
            el.attrib[attrib] = new_value
        edits.setdefault(attrib, []).append(
            (pos, len(new_value) - len(old_value)))
337
338
340 """
341 An object that represents a method on an element as a function;
342 the function takes either an element or an HTML string. It
343 returns whatever the function normally returns, or if the function
344 works in-place (and so returns None) it returns a serialized form
345 of the resulting document.
346 """
352 if isinstance(doc, basestring):
353 if 'copy' in kw:
354 raise TypeError(
355 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
356 return_string = True
357 doc = fromstring(doc, **kw)
358 else:
359 if 'copy' in kw:
360 copy = kw.pop('copy')
361 else:
362 copy = self.copy
363 return_string = False
364 if copy:
365 doc = copy.deepcopy(doc)
366 meth = getattr(doc, self.name)
367 result = meth(*args, **kw)
368
369 if result is None:
370
371 if return_string:
372 return tostring(doc)
373 else:
374 return doc
375 else:
376 return result
377
# Module-level function forms of the element methods, built with
# _MethodFunc.

# Query helpers: created with copy=False.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)

# Link-rewriting helpers that change the document: created with copy=True.
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)

# Link iteration only reads the document: copy=False.
iterlinks = _MethodFunc('iterlinks', copy=False)

# Rewriting changes the document: copy=True.
rewrite_links = _MethodFunc('rewrite_links', copy=True)
384
387
390
393
396
397
399 """A lookup scheme for HTML Element classes.
400
401 To create a lookup instance with different Element classes, pass a tag
402 name mapping of Element classes in the ``classes`` keyword argument and/or
403 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
404 The special key '*' denotes a Mixin class that should be mixed into all
405 Element classes.
406 """
407 _default_element_classes = {}
408
409 - def __init__(self, classes=None, mixins=None):
426
427 - def lookup(self, node_type, document, namespace, name):
438
439
440
441
442
449
451 """
452 Parses several HTML elements, returning a list of elements.
453
454 The first item in the list may be a string (though leading
455 whitespace is removed). If no_leading_text is true, then it will
456 be an error if there is leading text, and it will always be a list
457 of only elements.
458 """
459
460 start = html[:20].lstrip().lower()
461 if not start.startswith('<html') and not start.startswith('<!doctype'):
462 html = '<html><body>%s</body></html>' % html
463 doc = document_fromstring(html, **kw)
464 assert doc.tag == 'html'
465 bodies = [e for e in doc if e.tag == 'body']
466 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
467 body = bodies[0]
468 elements = []
469 if no_leading_text and body.text and body.text.strip():
470 raise etree.ParserError(
471 "There is leading text: %r" % body.text)
472 if body.text and body.text.strip():
473 elements.append(body.text)
474 elements.extend(body)
475
476
477 return elements
478
480 """
481 Parses a single HTML element; it is an error if there is more than
482 one element, or if anything but whitespace precedes or follows the
483 element.
484
485 If create_parent is true (or is a tag name) then a parent node
486 will be created to encapsulate the HTML in a single element.
487 """
488 if create_parent:
489 if not isinstance(create_parent, basestring):
490 create_parent = 'div'
491 return fragment_fromstring('<%s>%s</%s>' % (
492 create_parent, html, create_parent), **kw)
493 elements = fragments_fromstring(html, no_leading_text=True)
494 if not elements:
495 raise etree.ParserError(
496 "No elements found")
497 if len(elements) > 1:
498 raise etree.ParserError(
499 "Multiple elements found (%s)"
500 % ', '.join([_element_name(e) for e in elements]))
501 el = elements[0]
502 if el.tail and el.tail.strip():
503 raise etree.ParserError(
504 "Element followed by text: %r" % el.tail)
505 el.tail = None
506 return el
507
562
def parse(filename, parser=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document.

    ``parser`` is the parser handed to ``etree.parse``; when it is left
    as None the module-level ``html_parser`` is used.  Remaining keyword
    arguments are forwarded to ``etree.parse`` unchanged, e.g.
    ``base_url='http://...'`` to set the base URL.
    """
    if parser is not None:
        return etree.parse(filename, parser, **kw)
    return etree.parse(filename, html_parser, **kw)
573
581
583 if isinstance(el, etree.CommentBase):
584 return 'comment'
585 elif isinstance(el, basestring):
586 return 'string'
587 else:
588 return el.tag
589
590
591
592
593
694
695 HtmlElementClassLookup._default_element_classes['form'] = FormElement
696
729
731 import urllib
732
733 if method == 'GET':
734 if '?' in url:
735 url += '&'
736 else:
737 url += '?'
738 url += urllib.urlencode(values)
739 data = None
740 else:
741 data = urllib.urlencode(values)
742 return urllib.urlopen(url, data)
743
745
753 raise KeyError(
754 "You cannot remove keys from ElementDict")
758 return item in self.inputs
759
761 return '<%s for form %s>' % (
762 self.__class__.__name__,
763 self.inputs.form._name())
764
828
856
857 -class TextareaElement(InputMixin, HtmlElement):
858 """
859 ``<textarea>`` element. You can get the name with ``.name`` and
860 get/set the value with ``.value``
861 """
862
def value__get(self):
    """
    Get/set the value (which is the contents of this element)
    """
    content = self.text
    if not content:
        # No text (None or empty) is reported as the empty string.
        return ''
    return content
868 - def value__set(self, value):
870 - def value__del(self):
872 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
873
874 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
875
877 """
878 ``<select>`` element. You can get the name with ``.name``.
879
880 ``.value`` will be the value of the selected option, unless this
881 is a multi-select element (``<select multiple>``), in which case
882 it will be a set-like object. In either case ``.value_options``
883 gives the possible values.
884
885 The boolean attribute ``.multiple`` shows if this is a
886 multi-select.
887 """
888
890 """
891 Get/set the value of this select (the selected option).
892
893 If this is a multi-select, this is a set-like object that
894 represents all the selected options.
895 """
896 if self.multiple:
897 return MultipleSelectOptions(self)
898 for el in self.getiterator('option'):
899 if 'selected' in el.attrib:
900 value = el.get('value')
901
902 return value
903 return None
904
906 if self.multiple:
907 if isinstance(value, basestring):
908 raise TypeError(
909 "You must pass in a sequence")
910 self.value.clear()
911 self.value.update(value)
912 return
913 if value is not None:
914 for el in self.getiterator('option'):
915
916 if el.get('value') == value:
917 checked_option = el
918 break
919 else:
920 raise ValueError(
921 "There is no option with the value of %r" % value)
922 for el in self.getiterator('option'):
923 if 'selected' in el.attrib:
924 del el.attrib['selected']
925 if value is not None:
926 checked_option.set('selected', '')
927
934
935 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
936
938 """
939 All the possible values this select can have (the ``value``
940 attribute of all the ``<option>`` elements).
941 """
942 return [el.get('value') for el in self.getiterator('option')]
943 value_options = property(value_options, doc=value_options.__doc__)
944
946 """
947 Boolean attribute: is there a ``multiple`` attribute on this element.
948 """
949 return 'multiple' in self.attrib
951 if value:
952 self.set('multiple', '')
953 elif 'multiple' in self.attrib:
954 del self.attrib['multiple']
955 multiple = property(multiple__get, multiple__set, doc=multiple__get.__doc__)
956
957 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
958
960 """
961 Represents all the selected options in a ``<select multiple>`` element.
962
963 You can add to this set-like object to select an option, or remove
964 to unselect the option.
965 """
966
969
971 """
972 Iterator of all the ``<option>`` elements.
973 """
974 return self.select.getiterator('option')
975 options = property(options)
976
978 for option in self.options:
979 yield option.get('value')
980
def add(self, item):
    """
    Select the first option whose ``value`` attribute equals ``item``
    by setting its ``selected`` attribute.

    Raises ValueError when no option has that value.
    """
    for candidate in self.options:
        if candidate.get('value') != item:
            continue
        candidate.set('selected', '')
        return
    raise ValueError(
        "There is no option with the value %r" % item)
989
991 for option in self.options:
992 if option.get('value') == item:
993 if 'selected' in option.attrib:
994 del option.attrib['selected']
995 else:
996 raise ValueError(
997 "The option %r is not currently selected" % item)
998 break
999 else:
1000 raise ValueError(
1001 "There is not option with the value %r" % item)
1002
1004 return '<%s {%s} for select name=%r>' % (
1005 self.__class__.__name__,
1006 ', '.join([repr(v) for v in self]),
1007 self.select.name)
1008
1010 """
1011 This object represents several ``<input type=radio>`` elements
1012 that have the same name.
1013
1014 You can use this like a list, but also use the property
1015 ``.value`` to check/uncheck inputs. Also you can use
1016 ``.value_options`` to get the possible values.
1017 """
1018
1020 """
1021 Get/set the value, which checks the radio with that value (and
1022 unchecks any other value).
1023 """
1024 for el in self:
1025 if 'checked' in el.attrib:
1026 return el.get('value')
1027 return None
1028
1030 if value is not None:
1031 for el in self:
1032 if el.get('value') == value:
1033 checked_option = el
1034 break
1035 else:
1036 raise ValueError(
1037 "There is no radio input with the value %r" % value)
1038 for el in self:
1039 if 'checked' in el.attrib:
1040 del el.attrib['checked']
1041 if value is not None:
1042 checked_option.set('checked', '')
1043
1046
1047 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
1048
1050 """
1051 Returns a list of all the possible values.
1052 """
1053 return [el.get('value') for el in self]
1054 value_options = property(value_options, doc=value_options.__doc__)
1055
1057 return '%s(%s)' % (
1058 self.__class__.__name__,
1059 list.__repr__(self))
1060
1062 """
1063 Represents a group of checkboxes (``<input type=checkbox>``) that
1064 have the same name.
1065
1066 In addition to using this like a list, the ``.value`` attribute
1067 returns a set-like object that you can add to or remove from to
1068 check and uncheck checkboxes. You can also use ``.value_options``
1069 to get the possible values.
1070 """
1071
1073 """
1074 Return a set-like object that can be modified to check or
1075 uncheck individual checkboxes according to their value.
1076 """
1077 return CheckboxValues(self)
1087 value = property(value__get, value__set, value__del, doc=value__get.__doc__)
1088
1090 return '%s(%s)' % (
1091 self.__class__.__name__, list.__repr__(self))
1092
1094
1095 """
1096 Represents the values of the checked checkboxes in a group of
1097 checkboxes with the same name.
1098 """
1099
1102
1104 return iter([
1105 el.get('value')
1106 for el in self.group
1107 if 'checked' in el.attrib])
1108
def add(self, value):
    """
    Check the first checkbox in the group whose ``value`` attribute
    equals ``value`` by setting its ``checked`` attribute.

    Raises KeyError when the group has no checkbox with that value.
    """
    for box in self.group:
        if box.get('value') != value:
            continue
        box.set('checked', '')
        return
    raise KeyError("No checkbox with value %r" % value)
1116
1118 for el in self.group:
1119 if el.get('value') == value:
1120 if 'checked' in el.attrib:
1121 del el.attrib['checked']
1122 else:
1123 raise KeyError(
1124 "The checkbox with value %r was already unchecked" % value)
1125 break
1126 else:
1127 raise KeyError(
1128 "No checkbox with value %r" % value)
1129
1131 return '<%s {%s} for checkboxes name=%r>' % (
1132 self.__class__.__name__,
1133 ', '.join([repr(v) for v in self]),
1134 self.group.name)
1135
1219
1220 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1221
1223 """
1224 Represents a ``<label>`` element.
1225
1226 Label elements are linked to other elements with their ``for``
1227 attribute. You can access this element with ``label.for_element``.
1228 """
1229
1231 """
1232 Get/set the element this label points to. Return None if it
1233 can't be found.
1234 """
1235 id = self.get('for')
1236 if not id:
1237 return None
1238 return self.body.get_element_by_id(id)
1240 id = other.get('id')
1241 if not id:
1242 raise TypeError(
1243 "Element %r has no id attribute" % other)
1244 self.set('for', id)
1248 for_element = property(for_element__get, for_element__set, for_element__del,
1249 doc=for_element__get.__doc__)
1250
1251 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1252
1253
1254
1255
1256
1257
1258
# Bound ``sub`` of a pattern matching a
# ``<meta http-equiv="Content-Type" ...>`` tag; called as
# ``__replace_meta_content_type(replacement, text)``.
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None):
    """
    Return the HTML string representation of the given document.

    ``pretty_print`` and ``encoding`` are passed straight through to
    ``etree.tostring`` (which is called with ``method="html"``).

    Unless ``include_meta_content_type`` is true, every
    ``<meta http-equiv="Content-Type" ...>`` tag found in the serialized
    output is stripped before it is returned.  When it is true the
    serialized output is returned exactly as produced.

    ``doc`` must not be None (checked with an assertion).
    """
    assert doc is not None
    serialized = etree.tostring(doc, method="html",
                                pretty_print=pretty_print,
                                encoding=encoding)
    if include_meta_content_type:
        return serialized
    return __replace_meta_content_type('', serialized)
1277
1279 """
1280 Open the HTML document in a web browser (saving it to a temporary
1281 file to open it).
1282 """
1283 import os
1284 import webbrowser
1285 try:
1286 write_doc = doc.write
1287 except AttributeError:
1288 write_doc = etree.ElementTree(element=doc).write
1289 fn = os.tempnam() + '.html'
1290 write_doc(fn, method="html")
1291 url = 'file://' + fn.replace(os.path.sep, '/')
1292 print url
1293 webbrowser.open(url)
1294
1295
1296
1297
1298
1303
1307
1308 html_parser = HTMLParser()
1309