1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 except ImportError:
12
13 from urllib.parse import urlsplit
14 from lxml import etree
15 from lxml.html import defs
16 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
17 from lxml.html import xhtml_to_html, _transform_result
18
# Compatibility shims: each probe merely references a builtin name and, when
# that lookup raises NameError, binds the name to a fallback so the rest of
# the module can use it unconditionally.
try:
    unichr
except NameError:
    # No ``unichr`` builtin: fall back to ``chr``.
    unichr = chr
try:
    unicode
except NameError:
    # No ``unicode`` builtin: fall back to ``str``.
    unicode = str
try:
    bytes
except NameError:
    # No ``bytes`` builtin: fall back to ``str``.
    bytes = str
try:
    basestring
except NameError:
    # No ``basestring`` builtin: fall back to a tuple of the two string types.
    basestring = (str, bytes)


# Names exported by a star-import of this module.
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Matches a CSS ``expression(...)`` call, case-insensitively and across
# newlines.  The match is non-greedy, so it ends at the first closing
# parenthesis.
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Matches the ``@import`` keyword of a CSS import rule (whitespace is
# allowed between ``@`` and ``import``).
_css_import_re = re.compile(
    r'@\s*import', re.I)

# Bound ``search`` of a pattern matching strings that begin with
# ``data:image/`` and later contain ``;base64``.
_is_image_dataurl = re.compile(
    r'^data:image/.+;base64', re.I).search
# Bound ``search``: finds any of the listed scheme names followed by a colon.
# This is an unanchored search, so the scheme may occur anywhere in the string.
_is_possibly_malicious_scheme = re.compile(
    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I).search

# Bound ``sub`` of a pattern matching runs of whitespace and of the control
# characters \x00-\x08, \x0B, \x0C and \x0E-\x19.
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub

# Matches text of the form ``[if ...]>``; judging by the name, this is the
# opening of a conditional comment.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# XPath: the context element and every descendant carrying a ``style``
# attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# XPath: ``a`` elements (un-namespaced, or in the XHTML namespace) whose
# ``href`` is non-empty after whitespace normalisation and does not start
# with ``#``.
_find_external_links = etree.XPath(
    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
97
99 """
Instances clean the document of each of the possible offending
101 elements. The cleaning is controlled by attributes; you can
102 override attributes in a subclass, or set them in the constructor.
103
104 ``scripts``:
105 Removes any ``<script>`` tags.
106
107 ``javascript``:
108 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
109 as they could contain Javascript.
110
111 ``comments``:
112 Removes any comments.
113
114 ``style``:
115 Removes any style tags or attributes.
116
117 ``links``:
118 Removes any ``<link>`` tags
119
120 ``meta``:
121 Removes any ``<meta>`` tags
122
123 ``page_structure``:
124 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
125
126 ``processing_instructions``:
127 Removes any processing instructions.
128
129 ``embedded``:
130 Removes any embedded objects (flash, iframes)
131
132 ``frames``:
133 Removes any frame-related tags
134
135 ``forms``:
136 Removes any form tags
137
138 ``annoying_tags``:
139 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
140
141 ``remove_tags``:
142 A list of tags to remove. Only the tags will be removed,
143 their content will get pulled up into the parent tag.
144
145 ``kill_tags``:
146 A list of tags to kill. Killing also removes the tag's content,
147 i.e. the whole subtree, not just the tag itself.
148
149 ``allow_tags``:
150 A list of tags to include (default include all).
151
152 ``remove_unknown_tags``:
153 Remove any tags that aren't standard parts of HTML.
154
155 ``safe_attrs_only``:
156 If true, only include 'safe' attributes (specifically the list
157 from the feedparser HTML sanitisation web site).
158
159 ``safe_attrs``:
160 A set of attribute names to override the default list of attributes
161 considered 'safe' (when safe_attrs_only=True).
162
163 ``add_nofollow``:
164 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
165
166 ``host_whitelist``:
167 A list or set of hosts that you can use for embedded content
168 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
169 You can also implement/override the method
170 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
171 implement more complex rules for what can be embedded.
172 Anything that passes this test will be shown, regardless of
173 the value of (for instance) ``embedded``.
174
175 Note that this parameter might not work as intended if you do not
176 make the links absolute before doing the cleaning.
177
178 Note that you may also need to set ``whitelist_tags``.
179
180 ``whitelist_tags``:
181 A set of tags that can be included with ``host_whitelist``.
182 The default is ``iframe`` and ``embed``; you may wish to
183 include other tags like ``script``, or you may want to
184 implement ``allow_embedded_url`` for more control. Set to None to
185 include all tags.
186
187 This modifies the document *in place*.
188 """
189
190 scripts = True
191 javascript = True
192 comments = True
193 style = False
194 links = True
195 meta = True
196 page_structure = True
197 processing_instructions = True
198 embedded = True
199 frames = True
200 forms = True
201 annoying_tags = True
202 remove_tags = None
203 allow_tags = None
204 kill_tags = None
205 remove_unknown_tags = True
206 safe_attrs_only = True
207 safe_attrs = defs.safe_attrs
208 add_nofollow = False
209 host_whitelist = ()
210 whitelist_tags = set(['iframe', 'embed'])
211
218
219
220
221 _tag_link_attrs = dict(
222 script='src',
223 link='href',
224
225
226 applet=['code', 'object'],
227 iframe='src',
228 embed='src',
229 layer='src',
230
231
232
233
234
235
236
237
238 a='href',
239 )
240
242 """
243 Cleans the document.
244 """
245 if hasattr(doc, 'getroot'):
246
247 doc = doc.getroot()
248
249 xhtml_to_html(doc)
250
251
252 for el in doc.iter('image'):
253 el.tag = 'img'
254 if not self.comments:
255
256
257 self.kill_conditional_comments(doc)
258
259 kill_tags = set(self.kill_tags or ())
260 remove_tags = set(self.remove_tags or ())
261 allow_tags = set(self.allow_tags or ())
262
263 if self.scripts:
264 kill_tags.add('script')
265 if self.safe_attrs_only:
266 safe_attrs = set(self.safe_attrs)
267 for el in doc.iter(etree.Element):
268 attrib = el.attrib
269 for aname in attrib.keys():
270 if aname not in safe_attrs:
271 del attrib[aname]
272 if self.javascript:
273 if not (self.safe_attrs_only and
274 self.safe_attrs == defs.safe_attrs):
275
276 for el in doc.iter(etree.Element):
277 attrib = el.attrib
278 for aname in attrib.keys():
279 if aname.startswith('on'):
280 del attrib[aname]
281 doc.rewrite_links(self._remove_javascript_link,
282 resolve_base_href=False)
283 if not self.style:
284
285
286 for el in _find_styled_elements(doc):
287 old = el.get('style')
288 new = _css_javascript_re.sub('', old)
289 new = _css_import_re.sub('', new)
290 if self._has_sneaky_javascript(new):
291
292 del el.attrib['style']
293 elif new != old:
294 el.set('style', new)
295 for el in list(doc.iter('style')):
296 if el.get('type', '').lower().strip() == 'text/javascript':
297 el.drop_tree()
298 continue
299 old = el.text or ''
300 new = _css_javascript_re.sub('', old)
301
302 new = _css_import_re.sub('', old)
303 if self._has_sneaky_javascript(new):
304
305 el.text = '/* deleted */'
306 elif new != old:
307 el.text = new
308 if self.comments or self.processing_instructions:
309
310
311
312 kill_tags.add(etree.Comment)
313 if self.processing_instructions:
314 kill_tags.add(etree.ProcessingInstruction)
315 if self.style:
316 kill_tags.add('style')
317 etree.strip_attributes(doc, 'style')
318 if self.links:
319 kill_tags.add('link')
320 elif self.style or self.javascript:
321
322
323 for el in list(doc.iter('link')):
324 if 'stylesheet' in el.get('rel', '').lower():
325
326 if not self.allow_element(el):
327 el.drop_tree()
328 if self.meta:
329 kill_tags.add('meta')
330 if self.page_structure:
331 remove_tags.update(('head', 'html', 'title'))
332 if self.embedded:
333
334
335
336 for el in list(doc.iter('param')):
337 found_parent = False
338 parent = el.getparent()
339 while parent is not None and parent.tag not in ('applet', 'object'):
340 parent = parent.getparent()
341 if parent is None:
342 el.drop_tree()
343 kill_tags.update(('applet',))
344
345 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
346 if self.frames:
347
348
349
350 kill_tags.update(defs.frame_tags)
351 if self.forms:
352 remove_tags.add('form')
353 kill_tags.update(('button', 'input', 'select', 'textarea'))
354 if self.annoying_tags:
355 remove_tags.update(('blink', 'marquee'))
356
357 _remove = []
358 _kill = []
359 for el in doc.iter():
360 if el.tag in kill_tags:
361 if self.allow_element(el):
362 continue
363 _kill.append(el)
364 elif el.tag in remove_tags:
365 if self.allow_element(el):
366 continue
367 _remove.append(el)
368
369 if _remove and _remove[0] == doc:
370
371
372 el = _remove.pop(0)
373 el.tag = 'div'
374 el.attrib.clear()
375 elif _kill and _kill[0] == doc:
376
377
378 el = _kill.pop(0)
379 if el.tag != 'html':
380 el.tag = 'div'
381 el.clear()
382
383 _kill.reverse()
384 for el in _kill:
385 el.drop_tree()
386 for el in _remove:
387 el.drop_tag()
388
389 if self.remove_unknown_tags:
390 if allow_tags:
391 raise ValueError(
392 "It does not make sense to pass in both allow_tags and remove_unknown_tags")
393 allow_tags = set(defs.tags)
394 if allow_tags:
395 bad = []
396 for el in doc.iter():
397 if el.tag not in allow_tags:
398 bad.append(el)
399 if bad:
400 if bad[0] is doc:
401 el = bad.pop(0)
402 el.tag = 'div'
403 el.attrib.clear()
404 for el in bad:
405 el.drop_tag()
406 if self.add_nofollow:
407 for el in _find_external_links(doc):
408 if not self.allow_follow(el):
409 rel = el.get('rel')
410 if rel:
411 if ('nofollow' in rel
412 and ' nofollow ' in (' %s ' % rel)):
413 continue
414 rel = '%s nofollow' % rel
415 else:
416 rel = 'nofollow'
417 el.set('rel', rel)
418
420 """
421 Override to suppress rel="nofollow" on some anchors.
422 """
423 return False
424
426 if el.tag not in self._tag_link_attrs:
427 return False
428 attr = self._tag_link_attrs[el.tag]
429 if isinstance(attr, (list, tuple)):
430 for one_attr in attr:
431 url = el.get(one_attr)
432 if not url:
433 return False
434 if not self.allow_embedded_url(el, url):
435 return False
436 return True
437 else:
438 url = el.get(attr)
439 if not url:
440 return False
441 return self.allow_embedded_url(el, url)
442
444 if (self.whitelist_tags is not None
445 and el.tag not in self.whitelist_tags):
446 return False
447 scheme, netloc, path, query, fragment = urlsplit(url)
448 netloc = netloc.lower().split(':', 1)[0]
449 if scheme not in ('http', 'https'):
450 return False
451 if netloc in self.host_whitelist:
452 return True
453 return False
454
465
467 bad = []
468 for el in doc.iter(iterate):
469 if condition(el):
470 bad.append(el)
471 for el in bad:
472 el.drop_tree()
473
481
482 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
483
485 """
486 Depending on the browser, stuff like ``e x p r e s s i o n(...)``
487 can get interpreted, or ``expre/* stuff */ssion(...)``. This
checks for attempts to do stuff like this.
489
490 Typically the response will be to kill the entire style; if you
491 have just a bit of Javascript in the style another rule will catch
492 that and remove only the Javascript from the style; this catches
493 more sneaky attempts.
494 """
495 style = self._substitute_comments('', style)
496 style = style.replace('\\', '')
497 style = _substitute_whitespace('', style)
498 style = style.lower()
499 if 'javascript:' in style:
500 return True
501 if 'expression(' in style:
502 return True
503 return False
504
513
# A module-level Cleaner built with no constructor arguments, and its bound
# ``clean_html`` method; both names are listed in ``__all__``.
clean = Cleaner()
clean_html = clean.clean_html
516
517
518
519
520
# Patterns for text that can be turned into a link.  Each pattern defines a
# ``body`` group and a ``host`` group, both of which ``_link_text`` reads.
_link_regexes = [
    # http:// or https:// URL: a host, then an optional path and an optional
    # parenthesised path segment.  ``body`` spans the whole match.
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # mailto: address.  ``body`` is the address only, without the ``mailto:``
    # prefix that is part of the overall match.
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

# Tag names whose text should not be auto-linked.
# NOTE(review): presumably the default for autolink()'s ``avoid_elements``;
# the def line is not visible here -- confirm.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Host patterns that should never be linked: anything starting with
# ``localhost``, hosts ending in ``example.com``/``.org``/``.net``, and
# exactly ``127.0.0.1``.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

# CSS class names that opt an element out of auto-linking.
_avoid_classes = ['nolink']
536
541 """
542 Turn any URLs into links.
543
544 It will search for links identified by the given regular
545 expressions (by default mailto and http(s) links).
546
547 It won't link text in an element in avoid_elements, or an element
548 with a class in avoid_classes. It won't link to anything with a
host that matches one of the regular expressions in avoid_hosts
(default localhost, example.com/.org/.net and 127.0.0.1).
551
552 If you pass in an element, the element's tail will not be
553 substituted, only the contents of the element.
554 """
555 if el.tag in avoid_elements:
556 return
557 class_name = el.get('class')
558 if class_name:
559 class_name = class_name.split()
560 for match_class in avoid_classes:
561 if match_class in class_name:
562 return
563 for child in list(el):
564 autolink(child, link_regexes=link_regexes,
565 avoid_elements=avoid_elements,
566 avoid_hosts=avoid_hosts,
567 avoid_classes=avoid_classes)
568 if child.tail:
569 text, tail_children = _link_text(
570 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
571 if tail_children:
572 child.tail = text
573 index = el.index(child)
574 el[index+1:index+1] = tail_children
575 if el.text:
576 text, pre_children = _link_text(
577 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
578 if pre_children:
579 el.text = text
580 el[:0] = pre_children
581
582 -def _link_text(text, link_regexes, avoid_hosts, factory):
583 leading_text = ''
584 links = []
585 last_pos = 0
586 while 1:
587 best_match, best_pos = None, None
588 for regex in link_regexes:
589 regex_pos = last_pos
590 while 1:
591 match = regex.search(text, pos=regex_pos)
592 if match is None:
593 break
594 host = match.group('host')
595 for host_regex in avoid_hosts:
596 if host_regex.search(host):
597 regex_pos = match.end()
598 break
599 else:
600 break
601 if match is None:
602 continue
603 if best_pos is None or match.start() < best_pos:
604 best_match = match
605 best_pos = match.start()
606 if best_match is None:
607
608 if links:
609 assert not links[-1].tail
610 links[-1].tail = text
611 else:
612 assert not leading_text
613 leading_text = text
614 break
615 link = best_match.group(0)
616 end = best_match.end()
617 if link.endswith('.') or link.endswith(','):
618
619 end -= 1
620 link = link[:-1]
621 prev_text = text[:best_match.start()]
622 if links:
623 assert not links[-1].tail
624 links[-1].tail = prev_text
625 else:
626 assert not leading_text
627 leading_text = prev_text
628 anchor = factory('a')
629 anchor.set('href', link)
630 body = best_match.group('body')
631 if not body:
632 body = link
633 if body.endswith('.') or body.endswith(','):
634 body = body[:-1]
635 anchor.text = body
636 links.append(anchor)
637 text = text[end:]
638 return leading_text, links
639
648
# autolink_html reuses autolink's docstring.
autolink_html.__doc__ = autolink.__doc__
650
651
652
653
654
# Tag names whose text is never word-broken.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
# CSS class names that opt an element out of word breaking.
# NOTE(review): presumably the default for word_break()'s ``avoid_classes``;
# the def line is not visible here -- confirm.
_avoid_word_break_classes = ['nobreak']
657
662 """
663 Breaks any long words found in the body of the text (not attributes).
664
Doesn't affect any of the tags in avoid_elements, by default
``<pre>``, ``<textarea>`` and ``<code>``.
667
Breaks words by inserting &#8203;, which is the character reference
for the Unicode Zero Width Space character.  This generally takes up no space
670 in rendering, but does copy as a space, and in monospace contexts
671 usually takes up space.
672
673 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
674 """
675
676
677 if el.tag in _avoid_word_break_elements:
678 return
679 class_name = el.get('class')
680 if class_name:
681 dont_break = False
682 class_name = class_name.split()
683 for avoid in avoid_classes:
684 if avoid in class_name:
685 dont_break = True
686 break
687 if dont_break:
688 return
689 if el.text:
690 el.text = _break_text(el.text, max_width, break_character)
691 for child in el:
692 word_break(child, max_width=max_width,
693 avoid_elements=avoid_elements,
694 avoid_classes=avoid_classes,
695 break_character=break_character)
696 if child.tail:
697 child.tail = _break_text(child.tail, max_width, break_character)
698
704
705 -def _break_text(text, max_width, break_character):
706 words = text.split()
707 for word in words:
708 if len(word) > max_width:
709 replacement = _insert_break(word, max_width, break_character)
710 text = text.replace(word, replacement)
711 return text
712
# Matches any single character other than the letters a-z (case-insensitive).
# The word-breaking code uses these positions as preferred break points.
_break_prefer_re = re.compile(r'[^a-z]', re.I)
714
716 orig_word = word
717 result = ''
718 while len(word) > width:
719 start = word[:width]
720 breaks = list(_break_prefer_re.finditer(start))
721 if breaks:
722 last_break = breaks[-1]
723
724 if last_break.end() > width-10:
725
726
727 start = word[:last_break.end()]
728 result += start + break_character
729 word = word[len(start):]
730 result += word
731 return result
732