1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 except ImportError:
12
13 from urllib.parse import urlsplit
14 from lxml import etree
15 from lxml.html import defs
16 from lxml.html import fromstring, XHTML_NAMESPACE
17 from lxml.html import xhtml_to_html, _transform_result
18
19 try:
20 unichr
21 except NameError:
22
23 unichr = chr
24 try:
25 unicode
26 except NameError:
27
28 unicode = str
29 try:
30 bytes
31 except NameError:
32
33 bytes = str
34 try:
35 basestring
36 except NameError:
37 basestring = (str, bytes)
38
39
40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
41 'word_break', 'word_break_html']
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64 _css_javascript_re = re.compile(
65 r'expression\s*\(.*?\)', re.S|re.I)
66
67
68 _css_import_re = re.compile(
69 r'@\s*import', re.I)
70
71
72
73 _is_image_dataurl = re.compile(
74 r'^data:image/.+;base64', re.I).search
75 _is_possibly_malicious_scheme = re.compile(
76 r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
77 re.I).search
82
83 _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
84
85
86
87 _conditional_comment_re = re.compile(
88 r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
89
# Compiled XPath: the context element and every descendant that carries a
# ``style`` attribute.
_find_styled_elements = etree.XPath("descendant-or-self::*[@style]")

# Compiled XPath: plain and XHTML-namespaced <a> elements whose ``href`` is
# non-empty after whitespace normalisation and does not start with '#'.
_find_external_links = etree.XPath(
    ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x': XHTML_NAMESPACE})
97
98
100 """
101 Instances clean the document of each of the possible offending
102 elements. The cleaning is controlled by attributes; you can
103 override attributes in a subclass, or set them in the constructor.
104
105 ``scripts``:
106 Removes any ``<script>`` tags.
107
108 ``javascript``:
109 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
110 as they could contain Javascript.
111
112 ``comments``:
113 Removes any comments.
114
115 ``style``:
116 Removes any style tags.
117
118 ``inline_style``:
119 Removes any style attributes. Defaults to the value of the ``style`` option.
120
121 ``links``:
122 Removes any ``<link>`` tags
123
124 ``meta``:
125 Removes any ``<meta>`` tags
126
127 ``page_structure``:
128 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
129
130 ``processing_instructions``:
131 Removes any processing instructions.
132
133 ``embedded``:
134 Removes any embedded objects (flash, iframes)
135
136 ``frames``:
137 Removes any frame-related tags
138
139 ``forms``:
140 Removes any form tags
141
142 ``annoying_tags``:
143 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
144
145 ``remove_tags``:
146 A list of tags to remove. Only the tags will be removed,
147 their content will get pulled up into the parent tag.
148
149 ``kill_tags``:
150 A list of tags to kill. Killing also removes the tag's content,
151 i.e. the whole subtree, not just the tag itself.
152
153 ``allow_tags``:
154 A list of tags to include (default include all).
155
156 ``remove_unknown_tags``:
157 Remove any tags that aren't standard parts of HTML.
158
159 ``safe_attrs_only``:
160 If true, only include 'safe' attributes (specifically the list
161 from the feedparser HTML sanitisation web site).
162
163 ``safe_attrs``:
164 A set of attribute names to override the default list of attributes
165 considered 'safe' (when safe_attrs_only=True).
166
167 ``add_nofollow``:
168 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
169
170 ``host_whitelist``:
171 A list or set of hosts that you can use for embedded content
172 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
173 You can also implement/override the method
174 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
175 implement more complex rules for what can be embedded.
176 Anything that passes this test will be shown, regardless of
177 the value of (for instance) ``embedded``.
178
179 Note that this parameter might not work as intended if you do not
180 make the links absolute before doing the cleaning.
181
182 Note that you may also need to set ``whitelist_tags``.
183
184 ``whitelist_tags``:
185 A set of tags that can be included with ``host_whitelist``.
186 The default is ``iframe`` and ``embed``; you may wish to
187 include other tags like ``script``, or you may want to
188 implement ``allow_embedded_url`` for more control. Set to None to
189 include all tags.
190
191 This modifies the document *in place*.
192 """
193
194 scripts = True
195 javascript = True
196 comments = True
197 style = False
198 inline_style = None
199 links = True
200 meta = True
201 page_structure = True
202 processing_instructions = True
203 embedded = True
204 frames = True
205 forms = True
206 annoying_tags = True
207 remove_tags = None
208 allow_tags = None
209 kill_tags = None
210 remove_unknown_tags = True
211 safe_attrs_only = True
212 safe_attrs = defs.safe_attrs
213 add_nofollow = False
214 host_whitelist = ()
215 whitelist_tags = set(['iframe', 'embed'])
216
225
226
227
228 _tag_link_attrs = dict(
229 script='src',
230 link='href',
231
232
233 applet=['code', 'object'],
234 iframe='src',
235 embed='src',
236 layer='src',
237
238
239
240
241
242
243
244
245 a='href',
246 )
247
# NOTE(review): this method's ``def`` line and the enclosing class header are
# missing from this excerpt.  The signature is reconstructed from the body
# (it only uses ``self`` and ``doc``) and from ``clean = Cleaner()`` being
# exported as a callable -- confirm, and re-indent under the class when the
# file is restored.
def __call__(self, doc):
    """
    Cleans the document.

    The tree is modified in place according to the option attributes on
    ``self``.  ``doc`` may be an element, or any object with a
    ``getroot()`` method, in which case its root element is cleaned.

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set.
    """
    if hasattr(doc, 'getroot'):
        # A tree object was passed in; operate on its root element.
        doc = doc.getroot()
    xhtml_to_html(doc)
    # Rename every <image> element to <img> before any tag-based filtering.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Comments are being kept, so conditional comments get their own
        # pass; when all comments are killed below this is unnecessary.
        self.kill_conditional_comments(doc)

    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())
    allow_tags = set(self.allow_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter(etree.Element):
            attrib = el.attrib
            for aname in attrib.keys():
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and
                self.safe_attrs == defs.safe_attrs):
            # The default safe-attribute filter did not run, so strip the
            # ``on*`` attributes explicitly.
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        # Styles that are about to be removed wholesale need no scrubbing;
        # otherwise scrub them here.
        if not self.inline_style:
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something suspicious survived: drop the attribute.
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
        if not self.style:
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # Chain on ``new`` (not ``old``) so the expression()
                # removal above is kept, as in the inline-style branch.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something suspicious survived: blank the stylesheet.
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # Comments are killed when either option is set.
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
    if self.inline_style:
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # <link> elements are kept in general, but stylesheet links are
        # dropped unless ``allow_element`` accepts them.
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                if not self.allow_element(el):
                    el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # Drop every <param> that has no <applet> or <object> ancestor.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # These tags are unwrapped (content kept) rather than killed.
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first and mutate afterwards, so the tree is not modified
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # The root itself is marked for tag removal; rewrite it into a
        # bare <div> instead of dropping it.
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # The root itself is marked to be killed; empty it instead, and
        # rename it to <div> unless it is <html>.
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    # Drop the collected subtrees in reverse collection order.
    _kill.reverse()
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # Same root special case as above: rewrite, do not drop.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                rel = el.get('rel')
                if rel:
                    # Skip links that already carry ``nofollow`` as a
                    # whole word in their rel list.
                    if ('nofollow' in rel
                            and ' nofollow ' in (' %s ' % rel)):
                        continue
                    rel = '%s nofollow' % rel
                else:
                    rel = 'nofollow'
                el.set('rel', rel)
427
# NOTE(review): the ``def`` line is missing from this excerpt; the name comes
# from the ``self.allow_follow(el)`` call site and the parameter name is
# assumed -- confirm against the original file.
def allow_follow(self, anchor):
    """
    Override to suppress rel="nofollow" on some anchors.

    Return a true value to leave the given link untouched.  This default
    implementation always returns ``False``.
    """
    return False
433
# NOTE(review): the ``def`` line is missing from this excerpt; the signature
# is reconstructed from the ``self.allow_element(el)`` call sites -- confirm.
def allow_element(self, el):
    """
    Decide whether an element may stay because of the URL(s) it links to.

    Only tags listed in ``self._tag_link_attrs`` are considered.  Every
    link attribute configured for the tag must be present, non-empty and
    accepted by ``self.allow_embedded_url(el, url)``; otherwise the
    result is ``False``.
    """
    if el.tag not in self._tag_link_attrs:
        return False
    attr_spec = self._tag_link_attrs[el.tag]
    if not isinstance(attr_spec, (list, tuple)):
        # Single attribute name: the URL check decides directly.
        url = el.get(attr_spec)
        if not url:
            return False
        return self.allow_embedded_url(el, url)
    # Several attribute names: each one has to pass.
    for attr_name in attr_spec:
        url = el.get(attr_name)
        if not url or not self.allow_embedded_url(el, url):
            return False
    return True
451
# NOTE(review): the ``def`` line is missing from this excerpt; the signature
# is reconstructed from the ``self.allow_embedded_url(el, url)`` call site
# and from the class docstring -- confirm.
def allow_embedded_url(self, el, url):
    """
    Decide whether ``url`` may be embedded through element ``el``.

    Returns ``True`` only when the element's tag is permitted by
    ``self.whitelist_tags`` (``None`` permits every tag), the URL scheme
    is ``http`` or ``https``, and the URL's host is in
    ``self.host_whitelist``.  Returns ``False`` in every other case,
    including URLs that cannot be parsed.
    """
    if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
        return False
    try:
        parts = urlsplit(url)
        # ``hostname`` drops any ``user:password@`` prefix and the port, and
        # lower-cases the host.  Cutting the raw netloc at the first ':'
        # would let ``http://good:pw@evil/`` pass as host ``good``.
        host = parts.hostname
    except ValueError:
        # Malformed netloc (e.g. an unbalanced IPv6 bracket): not allowed.
        return False
    if parts.scheme not in ('http', 'https'):
        return False
    return host in self.host_whitelist
463
474
476 bad = []
477 for el in doc.iter(iterate):
478 if condition(el):
479 bad.append(el)
480 for el in bad:
481 el.drop_tree()
482
490
491 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
492
# NOTE(review): the ``def`` line is missing from this excerpt; the signature
# is reconstructed from the ``self._has_sneaky_javascript(new)`` call site
# and the body's use of ``style`` -- confirm.
def _has_sneaky_javascript(self, style):
    """
    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
    checks for attempts to do stuff like this.

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.

    Returns ``True`` if ``javascript:`` or ``expression(`` shows up once
    CSS comments, backslashes and whitespace are removed and the text is
    lower-cased, ``False`` otherwise.
    """
    normalized = self._substitute_comments('', style)
    normalized = normalized.replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    return 'javascript:' in normalized or 'expression(' in normalized
513
522
523 clean = Cleaner()
524 clean_html = clean.clean_html
525
526
527
528
529
530 _link_regexes = [
531 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
532
533 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
534 ]
535
536 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
537
538 _avoid_hosts = [
539 re.compile(r'^localhost', re.I),
540 re.compile(r'\bexample\.(?:com|org|net)$', re.I),
541 re.compile(r'^127\.0\.0\.1$'),
542 ]
543
544 _avoid_classes = ['nolink']
545
# NOTE(review): the ``def`` line is missing from this excerpt.  Parameter
# names and order are taken from the recursive call in the body; the default
# values are assumed to be the module-level lists of matching name -- confirm
# against the original file.
def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    It will search for links identified by the given regular
    expressions (by default mailto and http(s) links).

    It won't link text in an element in avoid_elements, or an element
    with a class in avoid_classes.  It won't link to anything with a
    host that matches one of the regular expressions in avoid_hosts.

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.
    """
    if el.tag in avoid_elements:
        return
    class_attr = el.get('class')
    if class_attr:
        own_classes = class_attr.split()
        if any(avoided in own_classes for avoided in avoid_classes):
            return
    # Iterate over a snapshot: new <a> siblings are inserted while looping.
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if not child.tail:
            continue
        remaining, new_links = _link_text(
            child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
        if new_links:
            child.tail = remaining
            insert_at = el.index(child) + 1
            el[insert_at:insert_at] = new_links
    if el.text:
        remaining, new_links = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if new_links:
            el.text = remaining
            el[:0] = new_links
590
591 -def _link_text(text, link_regexes, avoid_hosts, factory):
592 leading_text = ''
593 links = []
594 last_pos = 0
595 while 1:
596 best_match, best_pos = None, None
597 for regex in link_regexes:
598 regex_pos = last_pos
599 while 1:
600 match = regex.search(text, pos=regex_pos)
601 if match is None:
602 break
603 host = match.group('host')
604 for host_regex in avoid_hosts:
605 if host_regex.search(host):
606 regex_pos = match.end()
607 break
608 else:
609 break
610 if match is None:
611 continue
612 if best_pos is None or match.start() < best_pos:
613 best_match = match
614 best_pos = match.start()
615 if best_match is None:
616
617 if links:
618 assert not links[-1].tail
619 links[-1].tail = text
620 else:
621 assert not leading_text
622 leading_text = text
623 break
624 link = best_match.group(0)
625 end = best_match.end()
626 if link.endswith('.') or link.endswith(','):
627
628 end -= 1
629 link = link[:-1]
630 prev_text = text[:best_match.start()]
631 if links:
632 assert not links[-1].tail
633 links[-1].tail = prev_text
634 else:
635 assert not leading_text
636 leading_text = prev_text
637 anchor = factory('a')
638 anchor.set('href', link)
639 body = best_match.group('body')
640 if not body:
641 body = link
642 if body.endswith('.') or body.endswith(','):
643 body = body[:-1]
644 anchor.text = body
645 links.append(anchor)
646 text = text[end:]
647 return leading_text, links
648
657
658 autolink_html.__doc__ = autolink.__doc__
659
660
661
662
663
664 _avoid_word_break_elements = ['pre', 'textarea', 'code']
665 _avoid_word_break_classes = ['nobreak']
666
671 """
672 Breaks any long words found in the body of the text (not attributes).
673
674 Doesn't effect any of the tags in avoid_elements, by default
675 ``<textarea>`` and ``<pre>``
676
677 Breaks words by inserting ​, which is a unicode character
678 for Zero Width Space character. This generally takes up no space
679 in rendering, but does copy as a space, and in monospace contexts
680 usually takes up space.
681
682 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
683 """
684
685
686 if el.tag in _avoid_word_break_elements:
687 return
688 class_name = el.get('class')
689 if class_name:
690 dont_break = False
691 class_name = class_name.split()
692 for avoid in avoid_classes:
693 if avoid in class_name:
694 dont_break = True
695 break
696 if dont_break:
697 return
698 if el.text:
699 el.text = _break_text(el.text, max_width, break_character)
700 for child in el:
701 word_break(child, max_width=max_width,
702 avoid_elements=avoid_elements,
703 avoid_classes=avoid_classes,
704 break_character=break_character)
705 if child.tail:
706 child.tail = _break_text(child.tail, max_width, break_character)
707
713
714 -def _break_text(text, max_width, break_character):
715 words = text.split()
716 for word in words:
717 if len(word) > max_width:
718 replacement = _insert_break(word, max_width, break_character)
719 text = text.replace(word, replacement)
720 return text
721
722 _break_prefer_re = re.compile(r'[^a-z]', re.I)
723
725 orig_word = word
726 result = ''
727 while len(word) > width:
728 start = word[:width]
729 breaks = list(_break_prefer_re.finditer(start))
730 if breaks:
731 last_break = breaks[-1]
732
733 if last_break.end() > width-10:
734
735
736 start = word[:last_break.end()]
737 result += start + break_character
738 word = word[len(start):]
739 result += word
740 return result
741