1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 except ImportError:
12
13 from urllib.parse import urlsplit
14 from lxml import etree
15 from lxml.html import defs
16 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
17 from lxml.html import xhtml_to_html, _transform_result
18
# Python 2/3 compatibility aliases.  Each bare-name probe raises NameError on
# an interpreter that lacks the builtin; the closest equivalent is then bound
# under the old name so the rest of the module can use it unconditionally.
try:
    unichr
except NameError:
    # No unichr builtin here: chr takes its place.
    unichr = chr

try:
    unicode
except NameError:
    # No separate unicode type here: str takes its place.
    unicode = str

try:
    bytes
except NameError:
    # No bytes type here: str takes its place.
    bytes = str

try:
    basestring
except NameError:
    # No common string base class here: accept either string type.
    basestring = (str, bytes)
38
39
40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
41 'word_break', 'word_break_html']
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Matches a CSS ``expression(...)`` construct up to the first closing
# parenthesis (the body is matched lazily and may span lines).
_css_javascript_re = re.compile(r'expression\s*\(.*?\)', re.S | re.I)

# Matches the start of a CSS ``@import`` rule, tolerating whitespace after
# the ``@``.
_css_import_re = re.compile(r'@\s*import', re.I)

# Matches a URL scheme from the script-capable list, optionally preceded by
# whitespace.
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I)

# ``_substitute_whitespace(repl, text)`` replaces every run of whitespace in
# ``text`` with ``repl``.
_substitute_whitespace = re.compile(r'\s+').sub

# Matches the head of a conditional comment body such as ``[if IE 6]>``.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I | re.S)
81
# Selects the context element and every descendant that carries a ``style``
# attribute.
_find_styled_elements = etree.XPath("descendant-or-self::*[@style]")

# Selects plain and XHTML-namespaced ``<a>`` elements whose ``href`` is
# non-blank and does not start with '#'.
_find_external_links = etree.XPath(
    ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x': XHTML_NAMESPACE})
89
91 """
92 Instances clean the document of each of the possible offending
93 elements. The cleaning is controlled by attributes; you can
94 override attributes in a subclass, or set them in the constructor.
95
96 ``scripts``:
97 Removes any ``<script>`` tags.
98
99 ``javascript``:
100 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
101 as they could contain Javascript.
102
103 ``comments``:
104 Removes any comments.
105
106 ``style``:
107 Removes any style tags or attributes.
108
109 ``links``:
110 Removes any ``<link>`` tags
111
112 ``meta``:
113 Removes any ``<meta>`` tags
114
115 ``page_structure``:
116 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
117
118 ``processing_instructions``:
119 Removes any processing instructions.
120
121 ``embedded``:
122 Removes any embedded objects (flash, iframes)
123
124 ``frames``:
125 Removes any frame-related tags
126
127 ``forms``:
128 Removes any form tags
129
130 ``annoying_tags``:
131 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
132
133 ``remove_tags``:
134 A list of tags to remove. Only the tags will be removed,
135 their content will get pulled up into the parent tag.
136
137 ``kill_tags``:
138 A list of tags to kill. Killing also removes the tag's content,
139 i.e. the whole subtree, not just the tag itself.
140
141 ``allow_tags``:
142 A list of tags to include (default include all).
143
144 ``remove_unknown_tags``:
145 Remove any tags that aren't standard parts of HTML.
146
147 ``safe_attrs_only``:
148 If true, only include 'safe' attributes (specifically the list
149 from the feedparser HTML sanitisation web site).
150
151 ``safe_attrs``:
152 A set of attribute names to override the default list of attributes
153 considered 'safe' (when safe_attrs_only=True).
154
155 ``add_nofollow``:
156 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
157
158 ``host_whitelist``:
159 A list or set of hosts that you can use for embedded content
160 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
161 You can also implement/override the method
162 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
163 implement more complex rules for what can be embedded.
164 Anything that passes this test will be shown, regardless of
165 the value of (for instance) ``embedded``.
166
167 Note that this parameter might not work as intended if you do not
168 make the links absolute before doing the cleaning.
169
170 Note that you may also need to set ``whitelist_tags``.
171
172 ``whitelist_tags``:
173 A set of tags that can be included with ``host_whitelist``.
174 The default is ``iframe`` and ``embed``; you may wish to
175 include other tags like ``script``, or you may want to
176 implement ``allow_embedded_url`` for more control. Set to None to
177 include all tags.
178
179 This modifies the document *in place*.
180 """
181
182 scripts = True
183 javascript = True
184 comments = True
185 style = False
186 links = True
187 meta = True
188 page_structure = True
189 processing_instructions = True
190 embedded = True
191 frames = True
192 forms = True
193 annoying_tags = True
194 remove_tags = None
195 allow_tags = None
196 kill_tags = None
197 remove_unknown_tags = True
198 safe_attrs_only = True
199 safe_attrs = defs.safe_attrs
200 add_nofollow = False
201 host_whitelist = ()
202 whitelist_tags = set(['iframe', 'embed'])
203
210
211
212
213 _tag_link_attrs = dict(
214 script='src',
215 link='href',
216
217
218 applet=['code', 'object'],
219 iframe='src',
220 embed='src',
221 layer='src',
222
223
224
225
226
227
228
229
230 a='href',
231 )
232
# NOTE(review): the ``def`` line was absent from the text under review.  The
# name is inferred from the module exporting a Cleaner instance as the
# callable ``clean``; this is a Cleaner method, so re-indent it under the
# class when merging.
def __call__(self, doc):
    """
    Cleans the document.

    Every cleaning option that is switched on is applied to ``doc`` in
    place; nothing is returned.  ``doc`` may be an element, or any object
    exposing ``getroot()``, in which case its root element is cleaned.

    Raises ValueError when ``allow_tags`` is set while
    ``remove_unknown_tags`` is true.  The check happens late, after the
    earlier cleaning steps have already modified ``doc``.
    """
    if hasattr(doc, 'getroot'):
        # A tree wrapper was passed in; work on its root element.
        doc = doc.getroot()
    xhtml_to_html(doc)
    # Treat <image> as <img> so that it is cleaned like any other image.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Comments are being kept, so run the conditional-comment pass;
        # when all comments are killed below it is not needed.
        self.kill_conditional_comments(doc)

    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the names so that deleting cannot disturb iteration.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and
                self.safe_attrs == defs.safe_attrs):
            # The default safe-attribute pass above was not applied, so
            # strip the on* event handler attributes here.
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # Styles are kept, so they have to be scrubbed individually.
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Still suspicious after scrubbing: drop the attribute.
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # Chain on ``new``: feeding ``old`` here would discard the
                # expression removal above, and the check below would then
                # wipe the whole stylesheet instead of just the expression.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Still suspicious after scrubbing: blank the content.
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # Comments are killed when either option is enabled.
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # <link> elements are kept in general, but stylesheet links (any
        # ``rel`` containing "stylesheet") go unless explicitly allowed.
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                if not self.allow_element(el):
                    el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # Drop every <param> that has no <applet> or <object> ancestor.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # These are only unwrapped, so their fallback content survives.
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards: the tree must not change while it
    # is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] is doc:
        # The root element itself cannot be dropped, so it is rewritten
        # into an attribute-less <div> instead.
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] is doc:
        # The root element itself cannot be dropped, so it is emptied
        # instead (and renamed unless it is <html>).
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    # Reverse document order: descendants are dropped before ancestors.
    _kill.reverse()
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = [el for el in doc.iter() if el.tag not in allow_tags]
        if bad:
            if bad[0] is doc:
                # Same root special case as above.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if self.allow_follow(el):
                continue
            rel = el.get('rel')
            if rel:
                if 'nofollow' in rel.split():
                    # Already marked; do not add a duplicate token.
                    continue
                rel = '%s nofollow' % rel
            else:
                rel = 'nofollow'
            el.set('rel', rel)
411
# NOTE(review): the ``def`` line was absent from the text under review and is
# restored from the ``self.allow_follow(el)`` call site; the parameter name
# is an assumption -- confirm.  Cleaner method: re-indent under the class.
def allow_follow(self, anchor):
    """
    Hook for exempting individual anchors from rel="nofollow".

    Override it and return a true value for an anchor that should be left
    as it is.  This default returns False for every anchor, so none is
    exempt.
    """
    return False
417
# NOTE(review): the ``def`` line was absent from the text under review and is
# restored from the ``self.allow_element(el)`` call sites.  Cleaner method:
# re-indent under the class when merging.
def allow_element(self, el):
    """
    Tell whether ``el`` may stay because of the URL(s) it points at.

    The attribute(s) holding the element's URL are looked up by tag name in
    ``self._tag_link_attrs``.  A tag that is not listed, or a URL attribute
    that is missing or empty, gives False.  Otherwise the answer comes from
    ``self.allow_embedded_url(el, url)``; when a tag lists several
    attributes, every one of them has to pass.
    """
    link_attrs = self._tag_link_attrs
    if el.tag not in link_attrs:
        return False
    spec = link_attrs[el.tag]
    if not isinstance(spec, (list, tuple)):
        # Single attribute: the verdict is handed back unchanged.
        url = el.get(spec)
        if url:
            return self.allow_embedded_url(el, url)
        return False
    for name in spec:
        url = el.get(name)
        if not url or not self.allow_embedded_url(el, url):
            return False
    return True
435
# NOTE(review): the ``def`` line was absent from the text under review and is
# restored from the ``self.allow_embedded_url(el, url)`` call site.  Cleaner
# method: re-indent under the class when merging.
def allow_embedded_url(self, el, url):
    """
    Decide whether embedded content at ``url`` is acceptable for ``el``.

    Returns True only when all of the following hold:

    * ``self.whitelist_tags`` is None or contains ``el.tag``;
    * the URL scheme is ``http`` or ``https``;
    * the URL's host is in ``self.host_whitelist``.

    Returns False in every other case.
    """
    if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
        return False
    parts = urlsplit(url)
    if parts.scheme not in ('http', 'https'):
        return False
    # Compare the parsed host name, not a hand-split netloc: in
    # "http://good.com:80@evil.com/" the text before the first colon is
    # user info, and the host actually contacted is evil.com.  ``hostname``
    # drops user info and port and is lower-cased.
    return parts.hostname in self.host_whitelist
447
458
# NOTE(review): the ``def`` line was absent from the text under review and no
# call site is visible; the name, the parameter order and the ``iterate=None``
# default are assumptions -- confirm against the callers.  Cleaner method:
# re-indent under the class when merging.
def _kill_elements(self, doc, condition, iterate=None):
    """
    Drop every element of ``doc.iter(iterate)`` for which ``condition(el)``
    is true, together with its whole subtree.
    """
    # Decide first, drop afterwards, so the tree is not modified while it
    # is being iterated.
    doomed = [el for el in doc.iter(iterate) if condition(el)]
    for el in doomed:
        el.drop_tree()
466
474
475 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
476
# NOTE(review): the ``def`` line was absent from the text under review and is
# restored from the ``self._has_sneaky_javascript(new)`` call sites.  Cleaner
# method: re-indent under the class when merging.
def _has_sneaky_javascript(self, style):
    """
    Tell whether ``style`` still hides script behind obfuscation.

    Depending on the browser, spellings such as ``e x p r e s s i o n(...)``
    or ``expre/* stuff */ssion(...)`` can get interpreted.  To catch them,
    CSS comments, backslashes and all whitespace are stripped and the rest
    is lower-cased before looking for ``javascript:`` or ``expression(``.

    Typically the response to a True result is to kill the entire style.
    Plain, unobfuscated Javascript is removed from the style by another
    rule; this check catches the sneakier attempts.
    """
    normalized = self._substitute_comments('', style)
    normalized = normalized.replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    return 'javascript:' in normalized or 'expression(' in normalized
497
506
507 clean = Cleaner()
508 clean_html = clean.clean_html
509
510
511
512
513
# Patterns that locate link candidates in text.  Each one defines a ``body``
# group and a ``host`` group.
_link_regexes = [
    # http(s) URLs: host, optional path, optional parenthesised path tail.
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # mailto: addresses; ``body`` is the address without the scheme.
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
]

# Tag names listed as elements to avoid.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Host patterns to avoid: anything starting with "localhost", the reserved
# example.com/.org/.net domains, and the loopback address 127.0.0.1.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
]

# CSS class names listed as classes to avoid.
_avoid_classes = ['nolink']
529
# NOTE(review): the ``def`` line was absent from the text under review.  The
# parameter names come from the recursive call below; the default values are
# assumed to be the module-level lists defined just above -- confirm.
def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    Text matched by one of ``link_regexes`` is wrapped in a new ``<a>``
    element.  Nothing is linked inside an element whose tag is in
    ``avoid_elements`` or whose ``class`` attribute contains a name from
    ``avoid_classes``, and no link is made to a host that matches one of
    the regular expressions in ``avoid_hosts``.

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.
    """
    if el.tag in avoid_elements:
        return
    class_attr = el.get('class')
    if class_attr:
        classes = class_attr.split()
        if any(avoided in classes for avoided in avoid_classes):
            return
    # Iterate over a copy: new anchors are spliced into ``el`` on the way.
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if not child.tail:
            continue
        remaining, anchors = _link_text(
            child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            child.tail = remaining
            position = el.index(child) + 1
            el[position:position] = anchors
    if el.text:
        remaining, anchors = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            el.text = remaining
            el[:0] = anchors
574
def _link_text(text, link_regexes, avoid_hosts, factory):
    """
    Split ``text`` into leading text plus a list of new anchor elements.

    ``link_regexes`` are compiled patterns that each define a ``body`` and
    a ``host`` group.  A match whose host is found by any pattern in
    ``avoid_hosts`` is skipped.  ``factory('a')`` must return a new element
    offering ``set()``, ``text`` and ``tail``.

    Returns ``(leading_text, links)``.  The text that follows each link is
    stored in that anchor's ``tail``.  When nothing is linked the result is
    ``(text, [])``.
    """
    leading_text = ''
    links = []
    while True:
        # Among all patterns, pick the acceptable match that starts first.
        best_match, best_pos = None, None
        for regex in link_regexes:
            regex_pos = 0
            while True:
                match = regex.search(text, pos=regex_pos)
                if match is None:
                    break
                # The host pattern also swallows a sentence-ending dot
                # ("http://example.com."); strip it so that end-anchored
                # avoid_hosts patterns still recognise the host.
                host = match.group('host').rstrip('.')
                if any(host_regex.search(host) for host_regex in avoid_hosts):
                    # Unwanted host: retry this pattern further along.
                    regex_pos = match.end()
                else:
                    break
            if match is None:
                continue
            if best_pos is None or match.start() < best_pos:
                best_match = match
                best_pos = match.start()
        if best_match is None:
            # No more links: whatever is left is plain text.
            if links:
                assert not links[-1].tail
                links[-1].tail = text
            else:
                assert not leading_text
                leading_text = text
            break
        link = best_match.group(0)
        end = best_match.end()
        if link.endswith(('.', ',')):
            # Trailing punctuation belongs to the sentence, not the link;
            # leave it in the text that follows.
            end -= 1
            link = link[:-1]
        prev_text = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = prev_text
        else:
            assert not leading_text
            leading_text = prev_text
        anchor = factory('a')
        anchor.set('href', link)
        body = best_match.group('body')
        if not body:
            body = link
        if body.endswith(('.', ',')):
            body = body[:-1]
        anchor.text = body
        links.append(anchor)
        text = text[end:]
    return leading_text, links
632
641
642 autolink_html.__doc__ = autolink.__doc__
643
644
645
646
647
# Tags and CSS classes that word breaking leaves alone by default.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']


# NOTE(review): the ``def`` line was absent from the text under review.  The
# parameter names come from the recursive call below; the ``max_width``
# default of 40 is not visible in the reviewed text and is taken from
# upstream lxml -- confirm.
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in ``avoid_elements`` (by default
    ``<pre>``, ``<textarea>`` and ``<code>``), nor any element whose
    ``class`` attribute contains a name from ``avoid_classes``.

    Breaks words by inserting ``break_character``, by default U+200B
    (``&#8203;``), the Zero Width Space character.  This generally takes up
    no space in rendering, but does copy as a space, and in monospace
    contexts usually takes up space.

    The element is modified in place; its own tail is left untouched.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Honour the caller's list here; testing the module-level default
    # instead would silently ignore a custom ``avoid_elements``.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        if any(avoid in classes for avoid in avoid_classes):
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
691
697
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with ``break_character`` inserted into every
    whitespace-separated word longer than ``max_width``.

    All whitespace is kept exactly as it was.  Each word is rewritten at
    its own position, so a long word that also occurs inside another word
    cannot corrupt that other word.
    """
    pieces = []
    cursor = 0
    for word in text.split():
        # Only whitespace lies between ``cursor`` and the next word, so the
        # first hit from ``cursor`` on is this very token.
        start = text.index(word, cursor)
        pieces.append(text[cursor:start])
        cursor = start + len(word)
        if len(word) > max_width:
            word = _insert_break(word, max_width, break_character)
        pieces.append(word)
    pieces.append(text[cursor:])
    return ''.join(pieces)


# Any character that is not an ASCII letter is a preferred place to break.
_break_prefer_re = re.compile(r'[^a-z]', re.I)


# NOTE(review): the ``def`` line was absent from the text under review and is
# restored from the call in ``_break_text`` and the names used in the body.
def _insert_break(word, width, break_character):
    """
    Cut ``word`` into chunks of at most ``width`` characters and join them
    with ``break_character``.

    A chunk ends right after its last non-letter character when that point
    lies within the final ten characters of the chunk; otherwise it is cut
    at exactly ``width`` characters.

    Raises ValueError when ``width`` is smaller than 1, because no chunk
    could then make progress through the word.
    """
    if width < 1:
        raise ValueError("width must be at least 1, got %r" % (width,))
    chunks = []
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nicer break.
            if last_break.end() > width - 10:
                start = word[:last_break.end()]
        chunks.append(start)
        word = word[len(start):]
    chunks.append(word)
    return break_character.join(chunks)
725