1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 except ImportError:
12
13 from urllib.parse import urlsplit
14 from lxml import etree
15 from lxml.html import defs
16 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
17 from lxml.html import _nons, _transform_result
18
19 try:
20 set
21 except NameError:
22
23 from sets import Set as set
24
25 try:
26 unichr = __builtins__['unichr']
27 except (NameError, KeyError):
28
29 unichr = chr
30
31 try:
32 unicode = __builtins__['unicode']
33 except (NameError, KeyError):
34
35 unicode = str
36
37 try:
38 bytes = __builtins__['bytes']
39 except (NameError, KeyError):
40
41 bytes = str
42
43 try:
44 basestring = __builtins__['basestring']
45 except (NameError, KeyError):
46 basestring = (str, bytes)
47
48
49 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
50 'word_break', 'word_break_html']
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73 _css_javascript_re = re.compile(
74 r'expression\s*\(.*?\)', re.S|re.I)
75
76
77 _css_import_re = re.compile(
78 r'@\s*import', re.I)
79
80
81
82 _javascript_scheme_re = re.compile(
83 r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
84 _substitute_whitespace = re.compile(r'\s+').sub
85
86
87
88 _conditional_comment_re = re.compile(
89 r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
90
# Every element at or below the context node that has a ``style`` attribute.
_find_styled_elements = etree.XPath("descendant-or-self::*[@style]")

# Every <a> (plain or in the XHTML namespace) whose href is non-blank and
# does not start with '#'.
_xhtml_prefix_map = {'x': XHTML_NAMESPACE}
_find_external_links = etree.XPath(
    "descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
    "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']",
    namespaces=_xhtml_prefix_map)
98
100 """
101 Instances clean the document of each of the possible offending
102 elements. The cleaning is controlled by attributes; you can
103 override attributes in a subclass, or set them in the constructor.
104
105 ``scripts``:
106 Removes any ``<script>`` tags.
107
108 ``javascript``:
109 Removes any Javascript, like an ``onclick`` attribute.
110
111 ``comments``:
112 Removes any comments.
113
114 ``style``:
115 Removes any style tags or attributes.
116
117 ``links``:
118 Removes any ``<link>`` tags
119
120 ``meta``:
121 Removes any ``<meta>`` tags
122
123 ``page_structure``:
124 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
125
126 ``processing_instructions``:
127 Removes any processing instructions.
128
129 ``embedded``:
130 Removes any embedded objects (flash, iframes)
131
132 ``frames``:
133 Removes any frame-related tags
134
135 ``forms``:
136 Removes any form tags
137
138 ``annoying_tags``:
139 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
140
141 ``remove_tags``:
142 A list of tags to remove.
143
144 ``allow_tags``:
145 A list of tags to include (default include all).
146
147 ``remove_unknown_tags``:
148 Remove any tags that aren't standard parts of HTML.
149
150 ``safe_attrs_only``:
151 If true, only include 'safe' attributes (specifically the list
152 from `feedparser
153 <http://feedparser.org/docs/html-sanitization.html>`_).
154
155 ``add_nofollow``:
156 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
157
158 ``host_whitelist``:
159 A list or set of hosts that you can use for embedded content
160 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
161 You can also implement/override the method
162 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
163 implement more complex rules for what can be embedded.
164 Anything that passes this test will be shown, regardless of
165 the value of (for instance) ``embedded``.
166
167 Note that this parameter might not work as intended if you do not
168 make the links absolute before doing the cleaning.
169
170 ``whitelist_tags``:
171 A set of tags that can be included with ``host_whitelist``.
172 The default is ``iframe`` and ``embed``; you may wish to
173 include other tags like ``script``, or you may want to
174 implement ``allow_embedded_url`` for more control. Set to None to
175 include all tags.
176
177 This modifies the document *in place*.
178 """
179
180 scripts = True
181 javascript = True
182 comments = True
183 style = False
184 links = True
185 meta = True
186 page_structure = True
187 processing_instructions = True
188 embedded = True
189 frames = True
190 forms = True
191 annoying_tags = True
192 remove_tags = None
193 allow_tags = None
194 remove_unknown_tags = True
195 safe_attrs_only = True
196 add_nofollow = False
197 host_whitelist = ()
198 whitelist_tags = set(['iframe', 'embed'])
199
206
207
208
209 _tag_link_attrs = dict(
210 script='src',
211 link='href',
212
213
214 applet=['code', 'object'],
215 iframe='src',
216 embed='src',
217 layer='src',
218
219
220
221
222
223
224
225
226 a='href',
227 )
228
"""
Cleans the document in place, according to the attributes set on
this cleaner.

If ``doc`` has a ``getroot()`` method, its root element is cleaned.
Raises ``ValueError`` when both ``allow_tags`` and
``remove_unknown_tags`` are set.
"""
if hasattr(doc, 'getroot'):
    # Given a tree rather than an element: work on its root.
    doc = doc.getroot()
# Pass every string tag name through _nons(); nodes whose tag is not a
# string (comments, processing instructions) are skipped.
for el in doc.iter():
    tag = el.tag
    if isinstance(tag, basestring):
        el.tag = _nons(tag)
# Rename <image> to <img> so later steps only deal with one spelling.
for el in doc.iter('image'):
    el.tag = 'img'
if not self.comments:
    # Comments are not being removed wholesale, so conditional comments
    # need a pass of their own.
    self.kill_conditional_comments(doc)
# kill_tags are dropped together with their content (drop_tree);
# remove_tags lose only the tag itself (drop_tag).
kill_tags = set()
remove_tags = set(self.remove_tags or ())
if self.scripts:
    kill_tags.add('script')
if self.safe_attrs_only:
    safe_attrs = set(defs.safe_attrs)
    for el in doc.iter():
        attrib = el.attrib
        # Iterate over a snapshot: names are deleted inside the loop.
        for aname in list(attrib.keys()):
            if aname not in safe_attrs:
                del attrib[aname]
if self.javascript:
    if not self.safe_attrs_only:
        # Skipped after the safe-attribute pass above -- presumably
        # because on* handlers are not in defs.safe_attrs (confirm).
        for el in doc.iter():
            attrib = el.attrib
            for aname in list(attrib.keys()):
                if aname.startswith('on'):
                    del attrib[aname]
    doc.rewrite_links(self._remove_javascript_link,
                      resolve_base_href=False)
    if not self.style:
        # Styles are being kept, so script has to be scrubbed out of
        # style attributes and <style> elements.
        for el in _find_styled_elements(doc):
            old = el.get('style')
            new = _css_javascript_re.sub('', old)
            # Chain on ``new``: restarting from ``old`` would throw
            # away the expression() removal just done.
            new = _css_import_re.sub('', new)
            if self._has_sneaky_javascript(new):
                # Still looks like script after cleaning: drop it all.
                del el.attrib['style']
            elif new != old:
                el.set('style', new)
        for el in list(doc.iter('style')):
            if el.get('type', '').lower().strip() == 'text/javascript':
                el.drop_tree()
                continue
            old = el.text or ''
            new = _css_javascript_re.sub('', old)
            # @import is stripped too; chain on ``new`` as above.
            new = _css_import_re.sub('', new)
            if self._has_sneaky_javascript(new):
                # Still looks like script after cleaning: blank it.
                el.text = '/* deleted */'
            elif new != old:
                el.text = new
if self.comments or self.processing_instructions:
    # NOTE(review): comments are also killed when only
    # processing_instructions is set; kept as found -- confirm intent.
    kill_tags.add(etree.Comment)
if self.processing_instructions:
    kill_tags.add(etree.ProcessingInstruction)
if self.style:
    kill_tags.add('style')
    etree.strip_attributes(doc, 'style')
if self.links:
    kill_tags.add('link')
elif self.style or self.javascript:
    # <link> elements are kept in general, but stylesheet links are
    # still dropped (any rel containing "stylesheet").
    for el in list(doc.iter('link')):
        if 'stylesheet' in el.get('rel', '').lower():
            el.drop_tree()
if self.meta:
    kill_tags.add('meta')
if self.page_structure:
    remove_tags.update(('head', 'html', 'title'))
if self.embedded:
    # Drop any <param> that has no <applet> or <object> ancestor.
    for el in list(doc.iter('param')):
        parent = el.getparent()
        while parent is not None and parent.tag not in ('applet', 'object'):
            parent = parent.getparent()
        if parent is None:
            el.drop_tree()
    kill_tags.update(('applet',))
    # These lose only their tag, so their fallback content survives.
    remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
if self.frames:
    kill_tags.update(defs.frame_tags)
if self.forms:
    remove_tags.add('form')
    kill_tags.update(('button', 'input', 'select', 'textarea'))
if self.annoying_tags:
    remove_tags.update(('blink', 'marquee'))

# Collect first and modify afterwards, so the tree is not restructured
# while it is being iterated.
_remove = []
_kill = []
for el in doc.iter():
    if el.tag in kill_tags:
        if self.allow_element(el):
            continue
        _kill.append(el)
    elif el.tag in remove_tags:
        if self.allow_element(el):
            continue
        _remove.append(el)

if _remove and _remove[0] == doc:
    # The top-most element cannot be dropped; turn it into a bare <div>.
    el = _remove.pop(0)
    el.tag = 'div'
    el.attrib.clear()
elif _kill and _kill[0] == doc:
    # The top-most element cannot be dropped; empty it instead.
    el = _kill.pop(0)
    if el.tag != 'html':
        el.tag = 'div'
    el.clear()

for el in _kill:
    el.drop_tree()
for el in _remove:
    el.drop_tag()

allow_tags = self.allow_tags
if self.remove_unknown_tags:
    if allow_tags:
        raise ValueError(
            "It does not make sense to pass in both allow_tags and remove_unknown_tags")
    allow_tags = set(defs.tags)
if allow_tags:
    bad = [el for el in doc.iter() if el.tag not in allow_tags]
    if bad and bad[0] is doc:
        # Same rule as for remove_tags above: the top-most element is
        # rewritten to a bare <div> rather than dropped.
        el = bad.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    for el in bad:
        el.drop_tag()
if self.add_nofollow:
    for el in _find_external_links(doc):
        if not self.allow_follow(el):
            el.set('rel', 'nofollow')
"""
Override to suppress rel="nofollow" on some anchors.

The cleaner sets ``rel="nofollow"`` on a link only when this returns
a false value; the default returns False for every anchor.
"""
return False
401
403 if el.tag not in self._tag_link_attrs:
404 return False
405 attr = self._tag_link_attrs[el.tag]
406 if isinstance(attr, (list, tuple)):
407 for one_attr in attr:
408 url = el.get(one_attr)
409 if not url:
410 return False
411 if not self.allow_embedded_url(el, url):
412 return False
413 return True
414 else:
415 url = el.get(attr)
416 if not url:
417 return False
418 return self.allow_embedded_url(el, url)
419
421 if (self.whitelist_tags is not None
422 and el.tag not in self.whitelist_tags):
423 return False
424 scheme, netloc, path, query, fragment = urlsplit(url)
425 netloc = netloc.lower().split(':', 1)[0]
426 if scheme not in ('http', 'https'):
427 return False
428 if netloc in self.host_whitelist:
429 return True
430 return False
431
442
# Gather the matches before dropping anything, so the tree is not
# modified while doc.iter() is still walking it.
doomed = [node for node in doc.iter(iterate) if condition(node)]
for node in doomed:
    node.drop_tree()
450
458
459 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
460
462 """
463 Depending on the browser, stuff like ``e x p r e s s i o n(...)``
464 can get interpreted, or ``expre/* stuff */ssion(...)``. This
465 checks for attempt to do stuff like this.
466
467 Typically the response will be to kill the entire style; if you
468 have just a bit of Javascript in the style another rule will catch
469 that and remove only the Javascript from the style; this catches
470 more sneaky attempts.
471 """
472 style = self._substitute_comments('', style)
473 style = style.replace('\\', '')
474 style = _substitute_whitespace('', style)
475 style = style.lower()
476 if 'javascript:' in style:
477 return True
478 if 'expression(' in style:
479 return True
480 return False
481
490
491 clean = Cleaner()
492 clean_html = clean.clean_html
493
494
495
496
497
498 _link_regexes = [
499 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
500
501 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
502 ]
503
504 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
505
506 _avoid_hosts = [
507 re.compile(r'^localhost', re.I),
508 re.compile(r'\bexample\.(?:com|org|net)$', re.I),
509 re.compile(r'^127\.0\.0\.1$'),
510 ]
511
512 _avoid_classes = ['nolink']
513
518 """
519 Turn any URLs into links.
520
521 It will search for links identified by the given regular
522 expressions (by default mailto and http(s) links).
523
524 It won't link text in an element in avoid_elements, or an element
525 with a class in avoid_classes. It won't link to anything with a
526 host that matches one of the regular expressions in avoid_hosts
527 (default localhost and 127.0.0.1).
528
529 If you pass in an element, the element's tail will not be
530 substituted, only the contents of the element.
531 """
532 if el.tag in avoid_elements:
533 return
534 class_name = el.get('class')
535 if class_name:
536 class_name = class_name.split()
537 for match_class in avoid_classes:
538 if match_class in class_name:
539 return
540 for child in list(el):
541 autolink(child, link_regexes=link_regexes,
542 avoid_elements=avoid_elements,
543 avoid_hosts=avoid_hosts,
544 avoid_classes=avoid_classes)
545 if child.tail:
546 text, tail_children = _link_text(
547 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
548 if tail_children:
549 child.tail = text
550 index = el.index(child)
551 el[index+1:index+1] = tail_children
552 if el.text:
553 text, pre_children = _link_text(
554 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
555 if pre_children:
556 el.text = text
557 el[:0] = pre_children
558
559 -def _link_text(text, link_regexes, avoid_hosts, factory):
560 leading_text = ''
561 links = []
562 last_pos = 0
563 while 1:
564 best_match, best_pos = None, None
565 for regex in link_regexes:
566 regex_pos = last_pos
567 while 1:
568 match = regex.search(text, pos=regex_pos)
569 if match is None:
570 break
571 host = match.group('host')
572 for host_regex in avoid_hosts:
573 if host_regex.search(host):
574 regex_pos = match.end()
575 break
576 else:
577 break
578 if match is None:
579 continue
580 if best_pos is None or match.start() < best_pos:
581 best_match = match
582 best_pos = match.start()
583 if best_match is None:
584
585 if links:
586 assert not links[-1].tail
587 links[-1].tail = text
588 else:
589 assert not leading_text
590 leading_text = text
591 break
592 link = best_match.group(0)
593 end = best_match.end()
594 if link.endswith('.') or link.endswith(','):
595
596 end -= 1
597 link = link[:-1]
598 prev_text = text[:best_match.start()]
599 if links:
600 assert not links[-1].tail
601 links[-1].tail = prev_text
602 else:
603 assert not leading_text
604 leading_text = prev_text
605 anchor = factory('a')
606 anchor.set('href', link)
607 body = best_match.group('body')
608 if not body:
609 body = link
610 if body.endswith('.') or body.endswith(','):
611 body = body[:-1]
612 anchor.text = body
613 links.append(anchor)
614 text = text[end:]
615 return leading_text, links
616
625
626 autolink_html.__doc__ = autolink.__doc__
627
628
629
630
631
632 _avoid_word_break_elements = ['pre', 'textarea', 'code']
633 _avoid_word_break_classes = ['nobreak']
634
639 """
640 Breaks any long words found in the body of the text (not attributes).
641
642 Doesn't effect any of the tags in avoid_elements, by default
643 ``<textarea>`` and ``<pre>``
644
645 Breaks words by inserting ​, which is a unicode character
646 for Zero Width Space character. This generally takes up no space
647 in rendering, but does copy as a space, and in monospace contexts
648 usually takes up space.
649
650 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
651 """
652
653
654 if el.tag in _avoid_word_break_elements:
655 return
656 class_name = el.get('class')
657 if class_name:
658 dont_break = False
659 class_name = class_name.split()
660 for avoid in avoid_classes:
661 if avoid in class_name:
662 dont_break = True
663 break
664 if dont_break:
665 return
666 if el.text:
667 el.text = _break_text(el.text, max_width, break_character)
668 for child in el:
669 word_break(child, max_width=max_width,
670 avoid_elements=avoid_elements,
671 avoid_classes=avoid_classes,
672 break_character=break_character)
673 if child.tail:
674 child.tail = _break_text(child.tail, max_width, break_character)
675
681
682 -def _break_text(text, max_width, break_character):
683 words = text.split()
684 for word in words:
685 if len(word) > max_width:
686 replacement = _insert_break(word, max_width, break_character)
687 text = text.replace(word, replacement)
688 return text
689
690 _break_prefer_re = re.compile(r'[^a-z]', re.I)
691
693 orig_word = word
694 result = ''
695 while len(word) > width:
696 start = word[:width]
697 breaks = list(_break_prefer_re.finditer(start))
698 if breaks:
699 last_break = breaks[-1]
700
701 if last_break.end() > width-10:
702
703
704 start = word[:last_break.end()]
705 result += start + break_character
706 word = word[len(start):]
707 result += word
708 return result
709