1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 from __future__ import absolute_import
8
9 import re
10 import copy
11 try:
12 from urlparse import urlsplit
13 from urllib import unquote_plus
14 except ImportError:
15
16 from urllib.parse import urlsplit, unquote_plus
17 from lxml import etree
18 from lxml.html import defs
19 from lxml.html import fromstring, XHTML_NAMESPACE
20 from lxml.html import xhtml_to_html, _transform_result
21
22 try:
23 unichr
24 except NameError:
25
26 unichr = chr
27 try:
28 unicode
29 except NameError:
30
31 unicode = str
32 try:
33 bytes
34 except NameError:
35
36 bytes = str
37 try:
38 basestring
39 except NameError:
40 basestring = (str, bytes)
41
42
43 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
44 'word_break', 'word_break_html']
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67 _css_javascript_re = re.compile(
68 r'expression\s*\(.*?\)', re.S|re.I)
69
70
71 _css_import_re = re.compile(
72 r'@\s*import', re.I)
73
74
75
76 _is_image_dataurl = re.compile(
77 r'^data:image/.+;base64', re.I).search
78 _is_possibly_malicious_scheme = re.compile(
79 r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
80 re.I).search
85
86 _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
87
88
89
90 _conditional_comment_re = re.compile(
91 r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
92
93 _find_styled_elements = etree.XPath(
94 "descendant-or-self::*[@style]")
95
96 _find_external_links = etree.XPath(
97 ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
98 "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
99 namespaces={'x':XHTML_NAMESPACE})
100
101
103 """
104 Instances clean the document of each of the possible offending
105 elements. The cleaning is controlled by attributes; you can
106 override attributes in a subclass, or set them in the constructor.
107
108 ``scripts``:
109 Removes any ``<script>`` tags.
110
111 ``javascript``:
112 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
113 as they could contain Javascript.
114
115 ``comments``:
116 Removes any comments.
117
118 ``style``:
119 Removes any style tags.
120
121 ``inline_style``
122 Removes any style attributes. Defaults to the value of the ``style`` option.
123
124 ``links``:
125 Removes any ``<link>`` tags
126
127 ``meta``:
128 Removes any ``<meta>`` tags
129
130 ``page_structure``:
131 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
132
133 ``processing_instructions``:
134 Removes any processing instructions.
135
136 ``embedded``:
137 Removes any embedded objects (flash, iframes)
138
139 ``frames``:
140 Removes any frame-related tags
141
142 ``forms``:
143 Removes any form tags
144
145 ``annoying_tags``:
146 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
147
148 ``remove_tags``:
149 A list of tags to remove. Only the tags will be removed,
150 their content will get pulled up into the parent tag.
151
152 ``kill_tags``:
153 A list of tags to kill. Killing also removes the tag's content,
154 i.e. the whole subtree, not just the tag itself.
155
156 ``allow_tags``:
157 A list of tags to include (default include all).
158
159 ``remove_unknown_tags``:
160 Remove any tags that aren't standard parts of HTML.
161
162 ``safe_attrs_only``:
163 If true, only include 'safe' attributes (specifically the list
164 from the feedparser HTML sanitisation web site).
165
166 ``safe_attrs``:
167 A set of attribute names to override the default list of attributes
168 considered 'safe' (when safe_attrs_only=True).
169
170 ``add_nofollow``:
171 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
172
173 ``host_whitelist``:
174 A list or set of hosts that you can use for embedded content
175 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
176 You can also implement/override the method
177 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
178 implement more complex rules for what can be embedded.
179 Anything that passes this test will be shown, regardless of
180 the value of (for instance) ``embedded``.
181
182 Note that this parameter might not work as intended if you do not
183 make the links absolute before doing the cleaning.
184
185 Note that you may also need to set ``whitelist_tags``.
186
187 ``whitelist_tags``:
188 A set of tags that can be included with ``host_whitelist``.
189 The default is ``iframe`` and ``embed``; you may wish to
190 include other tags like ``script``, or you may want to
191 implement ``allow_embedded_url`` for more control. Set to None to
192 include all tags.
193
194 This modifies the document *in place*.
195 """
196
197 scripts = True
198 javascript = True
199 comments = True
200 style = False
201 inline_style = None
202 links = True
203 meta = True
204 page_structure = True
205 processing_instructions = True
206 embedded = True
207 frames = True
208 forms = True
209 annoying_tags = True
210 remove_tags = None
211 allow_tags = None
212 kill_tags = None
213 remove_unknown_tags = True
214 safe_attrs_only = True
215 safe_attrs = defs.safe_attrs
216 add_nofollow = False
217 host_whitelist = ()
218 whitelist_tags = set(['iframe', 'embed'])
219
228
229
230
231 _tag_link_attrs = dict(
232 script='src',
233 link='href',
234
235
236 applet=['code', 'object'],
237 iframe='src',
238 embed='src',
239 layer='src',
240
241
242
243
244
245
246
247
248 a='href',
249 )
250
252 """
253 Cleans the document.
254 """
255 if hasattr(doc, 'getroot'):
256
257 doc = doc.getroot()
258
259 xhtml_to_html(doc)
260
261
262 for el in doc.iter('image'):
263 el.tag = 'img'
264 if not self.comments:
265
266
267 self.kill_conditional_comments(doc)
268
269 kill_tags = set(self.kill_tags or ())
270 remove_tags = set(self.remove_tags or ())
271 allow_tags = set(self.allow_tags or ())
272
273 if self.scripts:
274 kill_tags.add('script')
275 if self.safe_attrs_only:
276 safe_attrs = set(self.safe_attrs)
277 for el in doc.iter(etree.Element):
278 attrib = el.attrib
279 for aname in attrib.keys():
280 if aname not in safe_attrs:
281 del attrib[aname]
282 if self.javascript:
283 if not (self.safe_attrs_only and
284 self.safe_attrs == defs.safe_attrs):
285
286 for el in doc.iter(etree.Element):
287 attrib = el.attrib
288 for aname in attrib.keys():
289 if aname.startswith('on'):
290 del attrib[aname]
291 doc.rewrite_links(self._remove_javascript_link,
292 resolve_base_href=False)
293
294
295 if not self.inline_style:
296 for el in _find_styled_elements(doc):
297 old = el.get('style')
298 new = _css_javascript_re.sub('', old)
299 new = _css_import_re.sub('', new)
300 if self._has_sneaky_javascript(new):
301
302 del el.attrib['style']
303 elif new != old:
304 el.set('style', new)
305 if not self.style:
306 for el in list(doc.iter('style')):
307 if el.get('type', '').lower().strip() == 'text/javascript':
308 el.drop_tree()
309 continue
310 old = el.text or ''
311 new = _css_javascript_re.sub('', old)
312
313 new = _css_import_re.sub('', old)
314 if self._has_sneaky_javascript(new):
315
316 el.text = '/* deleted */'
317 elif new != old:
318 el.text = new
319 if self.comments or self.processing_instructions:
320
321
322
323 kill_tags.add(etree.Comment)
324 if self.processing_instructions:
325 kill_tags.add(etree.ProcessingInstruction)
326 if self.style:
327 kill_tags.add('style')
328 if self.inline_style:
329 etree.strip_attributes(doc, 'style')
330 if self.links:
331 kill_tags.add('link')
332 elif self.style or self.javascript:
333
334
335 for el in list(doc.iter('link')):
336 if 'stylesheet' in el.get('rel', '').lower():
337
338 if not self.allow_element(el):
339 el.drop_tree()
340 if self.meta:
341 kill_tags.add('meta')
342 if self.page_structure:
343 remove_tags.update(('head', 'html', 'title'))
344 if self.embedded:
345
346
347
348 for el in list(doc.iter('param')):
349 found_parent = False
350 parent = el.getparent()
351 while parent is not None and parent.tag not in ('applet', 'object'):
352 parent = parent.getparent()
353 if parent is None:
354 el.drop_tree()
355 kill_tags.update(('applet',))
356
357 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
358 if self.frames:
359
360
361
362 kill_tags.update(defs.frame_tags)
363 if self.forms:
364 remove_tags.add('form')
365 kill_tags.update(('button', 'input', 'select', 'textarea'))
366 if self.annoying_tags:
367 remove_tags.update(('blink', 'marquee'))
368
369 _remove = []
370 _kill = []
371 for el in doc.iter():
372 if el.tag in kill_tags:
373 if self.allow_element(el):
374 continue
375 _kill.append(el)
376 elif el.tag in remove_tags:
377 if self.allow_element(el):
378 continue
379 _remove.append(el)
380
381 if _remove and _remove[0] == doc:
382
383
384 el = _remove.pop(0)
385 el.tag = 'div'
386 el.attrib.clear()
387 elif _kill and _kill[0] == doc:
388
389
390 el = _kill.pop(0)
391 if el.tag != 'html':
392 el.tag = 'div'
393 el.clear()
394
395 _kill.reverse()
396 for el in _kill:
397 el.drop_tree()
398 for el in _remove:
399 el.drop_tag()
400
401 if self.remove_unknown_tags:
402 if allow_tags:
403 raise ValueError(
404 "It does not make sense to pass in both allow_tags and remove_unknown_tags")
405 allow_tags = set(defs.tags)
406 if allow_tags:
407 bad = []
408 for el in doc.iter():
409 if el.tag not in allow_tags:
410 bad.append(el)
411 if bad:
412 if bad[0] is doc:
413 el = bad.pop(0)
414 el.tag = 'div'
415 el.attrib.clear()
416 for el in bad:
417 el.drop_tag()
418 if self.add_nofollow:
419 for el in _find_external_links(doc):
420 if not self.allow_follow(el):
421 rel = el.get('rel')
422 if rel:
423 if ('nofollow' in rel
424 and ' nofollow ' in (' %s ' % rel)):
425 continue
426 rel = '%s nofollow' % rel
427 else:
428 rel = 'nofollow'
429 el.set('rel', rel)
430
432 """
433 Override to suppress rel="nofollow" on some anchors.
434 """
435 return False
436
438 if el.tag not in self._tag_link_attrs:
439 return False
440 attr = self._tag_link_attrs[el.tag]
441 if isinstance(attr, (list, tuple)):
442 for one_attr in attr:
443 url = el.get(one_attr)
444 if not url:
445 return False
446 if not self.allow_embedded_url(el, url):
447 return False
448 return True
449 else:
450 url = el.get(attr)
451 if not url:
452 return False
453 return self.allow_embedded_url(el, url)
454
456 if (self.whitelist_tags is not None
457 and el.tag not in self.whitelist_tags):
458 return False
459 scheme, netloc, path, query, fragment = urlsplit(url)
460 netloc = netloc.lower().split(':', 1)[0]
461 if scheme not in ('http', 'https'):
462 return False
463 if netloc in self.host_whitelist:
464 return True
465 return False
466
477
479 bad = []
480 for el in doc.iter(iterate):
481 if condition(el):
482 bad.append(el)
483 for el in bad:
484 el.drop_tree()
485
493
494 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
495
497 """
498 Depending on the browser, stuff like ``e x p r e s s i o n(...)``
499 can get interpreted, or ``expre/* stuff */ssion(...)``. This
500 checks for attempt to do stuff like this.
501
502 Typically the response will be to kill the entire style; if you
503 have just a bit of Javascript in the style another rule will catch
504 that and remove only the Javascript from the style; this catches
505 more sneaky attempts.
506 """
507 style = self._substitute_comments('', style)
508 style = style.replace('\\', '')
509 style = _substitute_whitespace('', style)
510 style = style.lower()
511 if 'javascript:' in style:
512 return True
513 if 'expression(' in style:
514 return True
515 return False
516
525
526 clean = Cleaner()
527 clean_html = clean.clean_html
528
529
530
531
532
533 _link_regexes = [
534 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
535
536 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
537 ]
538
539 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
540
541 _avoid_hosts = [
542 re.compile(r'^localhost', re.I),
543 re.compile(r'\bexample\.(?:com|org|net)$', re.I),
544 re.compile(r'^127\.0\.0\.1$'),
545 ]
546
547 _avoid_classes = ['nolink']
548
553 """
554 Turn any URLs into links.
555
556 It will search for links identified by the given regular
557 expressions (by default mailto and http(s) links).
558
559 It won't link text in an element in avoid_elements, or an element
560 with a class in avoid_classes. It won't link to anything with a
561 host that matches one of the regular expressions in avoid_hosts
562 (default localhost and 127.0.0.1).
563
564 If you pass in an element, the element's tail will not be
565 substituted, only the contents of the element.
566 """
567 if el.tag in avoid_elements:
568 return
569 class_name = el.get('class')
570 if class_name:
571 class_name = class_name.split()
572 for match_class in avoid_classes:
573 if match_class in class_name:
574 return
575 for child in list(el):
576 autolink(child, link_regexes=link_regexes,
577 avoid_elements=avoid_elements,
578 avoid_hosts=avoid_hosts,
579 avoid_classes=avoid_classes)
580 if child.tail:
581 text, tail_children = _link_text(
582 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
583 if tail_children:
584 child.tail = text
585 index = el.index(child)
586 el[index+1:index+1] = tail_children
587 if el.text:
588 text, pre_children = _link_text(
589 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
590 if pre_children:
591 el.text = text
592 el[:0] = pre_children
593
594 -def _link_text(text, link_regexes, avoid_hosts, factory):
595 leading_text = ''
596 links = []
597 last_pos = 0
598 while 1:
599 best_match, best_pos = None, None
600 for regex in link_regexes:
601 regex_pos = last_pos
602 while 1:
603 match = regex.search(text, pos=regex_pos)
604 if match is None:
605 break
606 host = match.group('host')
607 for host_regex in avoid_hosts:
608 if host_regex.search(host):
609 regex_pos = match.end()
610 break
611 else:
612 break
613 if match is None:
614 continue
615 if best_pos is None or match.start() < best_pos:
616 best_match = match
617 best_pos = match.start()
618 if best_match is None:
619
620 if links:
621 assert not links[-1].tail
622 links[-1].tail = text
623 else:
624 assert not leading_text
625 leading_text = text
626 break
627 link = best_match.group(0)
628 end = best_match.end()
629 if link.endswith('.') or link.endswith(','):
630
631 end -= 1
632 link = link[:-1]
633 prev_text = text[:best_match.start()]
634 if links:
635 assert not links[-1].tail
636 links[-1].tail = prev_text
637 else:
638 assert not leading_text
639 leading_text = prev_text
640 anchor = factory('a')
641 anchor.set('href', link)
642 body = best_match.group('body')
643 if not body:
644 body = link
645 if body.endswith('.') or body.endswith(','):
646 body = body[:-1]
647 anchor.text = body
648 links.append(anchor)
649 text = text[end:]
650 return leading_text, links
651
660
661 autolink_html.__doc__ = autolink.__doc__
662
663
664
665
666
667 _avoid_word_break_elements = ['pre', 'textarea', 'code']
668 _avoid_word_break_classes = ['nobreak']
669
674 """
675 Breaks any long words found in the body of the text (not attributes).
676
677 Doesn't affect any of the tags in avoid_elements, by default
678 ``<textarea>`` and ``<pre>``
679
680 Breaks words by inserting &#8203;, which is a unicode character
681 for Zero Width Space character. This generally takes up no space
682 in rendering, but does copy as a space, and in monospace contexts
683 usually takes up space.
684
685 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
686 """
687
688
689 if el.tag in _avoid_word_break_elements:
690 return
691 class_name = el.get('class')
692 if class_name:
693 dont_break = False
694 class_name = class_name.split()
695 for avoid in avoid_classes:
696 if avoid in class_name:
697 dont_break = True
698 break
699 if dont_break:
700 return
701 if el.text:
702 el.text = _break_text(el.text, max_width, break_character)
703 for child in el:
704 word_break(child, max_width=max_width,
705 avoid_elements=avoid_elements,
706 avoid_classes=avoid_classes,
707 break_character=break_character)
708 if child.tail:
709 child.tail = _break_text(child.tail, max_width, break_character)
710
716
717 -def _break_text(text, max_width, break_character):
718 words = text.split()
719 for word in words:
720 if len(word) > max_width:
721 replacement = _insert_break(word, max_width, break_character)
722 text = text.replace(word, replacement)
723 return text
724
725 _break_prefer_re = re.compile(r'[^a-z]', re.I)
726
728 orig_word = word
729 result = ''
730 while len(word) > width:
731 start = word[:width]
732 breaks = list(_break_prefer_re.finditer(start))
733 if breaks:
734 last_break = breaks[-1]
735
736 if last_break.end() > width-10:
737
738
739 start = word[:last_break.end()]
740 result += start + break_character
741 word = word[len(start):]
742 result += word
743 return result
744