1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 except ImportError:
12
13 from urllib.parse import urlsplit
14 from lxml import etree
15 from lxml.html import defs
16 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
17 from lxml.html import _nons, _transform_result
18
# Compatibility aliases so the rest of the module can use one set of names
# (``set``, ``unichr``, ``unicode``, ``bytes``, ``basestring``) on both old
# and new Python versions.
try:
    set
except NameError:
    # No builtin ``set``: fall back to the ``sets`` module.
    from sets import Set as set

# ``__builtins__`` is a CPython implementation detail: it is a dict inside
# imported modules but the builtins *module* when this file is run as
# ``__main__``.  Subscripting the module form raises TypeError, so normalise
# it to a dict before looking names up.
try:
    _builtins = __builtins__
except NameError:
    _builtins = {}
if not isinstance(_builtins, dict):
    _builtins = vars(_builtins)

# Each lookup falls back to the closest equivalent when the name is missing.
unichr = _builtins.get('unichr', chr)
unicode = _builtins.get('unicode', str)
bytes = _builtins.get('bytes', str)
basestring = _builtins.get('basestring', (str, bytes))
47
48
49 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
50 'word_break', 'word_break_html']
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Matches a CSS ``expression(...)`` construct, case-insensitively and
# across newlines (non-greedy up to the first closing parenthesis).
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Matches the start of a CSS ``@import`` rule (whitespace allowed after
# the ``@``).
_css_import_re = re.compile(
    r'@\s*import', re.I)

# Matches one of the listed URL schemes at the start of a value, allowing
# leading whitespace; case-insensitive.
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
# Bound ``sub`` for runs of whitespace: ``_substitute_whitespace(repl, s)``.
_substitute_whitespace = re.compile(r'\s+').sub

# Matches the ``[if ...]>`` opener found inside conditional comments.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# Selects the context element and every descendant with a ``style``
# attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Selects ``<a>`` elements (plain or in the XHTML namespace) whose ``href``
# is non-empty after whitespace normalisation and does not start with '#'.
_find_external_links = etree.XPath(
    ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
98
100 """
101 Instances clean the document of each of the possible offending
102 elements. The cleaning is controlled by attributes; you can
103 override attributes in a subclass, or set them in the constructor.
104
105 ``scripts``:
106 Removes any ``<script>`` tags.
107
108 ``javascript``:
109 Removes any Javascript, like an ``onclick`` attribute.
110
111 ``comments``:
112 Removes any comments.
113
114 ``style``:
115 Removes any style tags or attributes.
116
117 ``links``:
118 Removes any ``<link>`` tags
119
120 ``meta``:
121 Removes any ``<meta>`` tags
122
123 ``page_structure``:
124 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
125
126 ``processing_instructions``:
127 Removes any processing instructions.
128
129 ``embedded``:
130 Removes any embedded objects (flash, iframes)
131
132 ``frames``:
133 Removes any frame-related tags
134
135 ``forms``:
136 Removes any form tags
137
138 ``annoying_tags``:
139 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
140
141 ``remove_tags``:
142 A list of tags to remove.
143
144 ``allow_tags``:
145 A list of tags to include (default include all).
146
147 ``remove_unknown_tags``:
148 Remove any tags that aren't standard parts of HTML.
149
150 ``safe_attrs_only``:
151 If true, only include 'safe' attributes (specifically the list
152 from `feedparser
153 <http://feedparser.org/docs/html-sanitization.html>`_).
154
155 ``add_nofollow``:
156 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
157
158 ``host_whitelist``:
159 A list or set of hosts that you can use for embedded content
160 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
161 You can also implement/override the method
162 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
163 implement more complex rules for what can be embedded.
164 Anything that passes this test will be shown, regardless of
165 the value of (for instance) ``embedded``.
166
167 Note that this parameter might not work as intended if you do not
168 make the links absolute before doing the cleaning.
169
170 ``whitelist_tags``:
171 A set of tags that can be included with ``host_whitelist``.
172 The default is ``iframe`` and ``embed``; you may wish to
173 include other tags like ``script``, or you may want to
174 implement ``allow_embedded_url`` for more control. Set to None to
175 include all tags.
176
177 This modifies the document *in place*.
178 """
179
180 scripts = True
181 javascript = True
182 comments = True
183 style = False
184 links = True
185 meta = True
186 page_structure = True
187 processing_instructions = True
188 embedded = True
189 frames = True
190 forms = True
191 annoying_tags = True
192 remove_tags = None
193 allow_tags = None
194 remove_unknown_tags = True
195 safe_attrs_only = True
196 add_nofollow = False
197 host_whitelist = ()
198 whitelist_tags = set(['iframe', 'embed'])
199
206
207
208
209 _tag_link_attrs = dict(
210 script='src',
211 link='href',
212
213
214 applet=['code', 'object'],
215 iframe='src',
216 embed='src',
217 layer='src',
218
219
220
221
222
223
224
225
226 a='href',
227 )
228
def __call__(self, doc):
    """
    Cleans the document.

    ``doc`` is modified in place.  If it has a ``getroot`` method (a tree
    rather than an element) its root element is cleaned instead.  What
    gets stripped is decided by the instance attributes (``scripts``,
    ``javascript``, ``style``, ``embedded`` ...).

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set; that check runs after the earlier
    cleaning steps have already been applied to ``doc``.
    """
    # NOTE(review): the ``def`` line of this method was missing from the
    # text this block was restored from.  The name and signature were
    # inferred from the body and from the module-level ``clean = Cleaner()``
    # being exported as a callable -- confirm against the upstream file.
    if hasattr(doc, 'getroot'):
        # A tree object rather than an element: work on its root.
        doc = doc.getroot()
    # Pass every string tag name through ``_nons``.
    for el in doc.iter():
        tag = el.tag
        if isinstance(tag, basestring):
            el.tag = _nons(tag)
    # Rename <image> to <img> so the later steps only see one spelling.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Comments are being kept, so conditional comments need their
        # own pass.
        self.kill_conditional_comments(doc)

    # ``kill_tags``: dropped with drop_tree(); ``remove_tags``: dropped
    # with drop_tag().
    kill_tags = set()
    remove_tags = set(self.remove_tags or ())
    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the names: entries are deleted while looping.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # Otherwise the safe-attribute pass above has already run;
            # presumably no on* attribute is in defs.safe_attrs.
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # Styles are being kept, so they have to be scrubbed.
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new``: substituting on ``old`` again would
                # throw away the expression() removal just done.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Still suspicious after scrubbing: drop it all.
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # Same chaining as for the style attributes above.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # Comments are also killed when only processing_instructions
        # is set.
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        for el in _find_styled_elements(doc):
            del el.attrib['style']
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # <link> is kept in general, but stylesheet links still go.
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Substring test, so "alternate stylesheet" matches too.
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # Drop every <param> that has no <applet>/<object> ancestor.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # These lose only their tag; their content stays in place.
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        # The element is spelled ``marquee``.
        remove_tags.update(('blink', 'marquee'))

    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # The top-most element cannot be dropped; turn it into a bare
        # <div> instead.
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # The top-most element cannot be dropped; empty it instead.
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        # Collect first: dropping tags while iterating would disturb
        # the traversal.
        bad = [el for el in doc.iter() if el.tag not in allow_tags]
        for el in bad:
            el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')
396
398 """
399 Override to suppress rel="nofollow" on some anchors.
400 """
401 return False
402
def allow_element(self, el):
    """
    Tell whether ``el`` may stay because its link target is acceptable.

    Looks up the link attribute name(s) for ``el.tag`` in
    ``_tag_link_attrs``.  Tags without an entry are never allowed.  When
    the entry is a list or tuple, every listed attribute must be present
    and accepted by ``allow_embedded_url``; otherwise the single attribute
    must be present and the result of ``allow_embedded_url`` is returned.
    """
    # NOTE(review): the ``def`` line was missing from the restored text;
    # the name comes from the ``self.allow_element(el)`` call sites.
    try:
        link_attrs = self._tag_link_attrs[el.tag]
    except KeyError:
        return False
    if isinstance(link_attrs, (list, tuple)):
        for attr_name in link_attrs:
            target = el.get(attr_name)
            if not (target and self.allow_embedded_url(el, target)):
                return False
        return True
    target = el.get(link_attrs)
    if not target:
        return False
    return self.allow_embedded_url(el, target)
420
def allow_embedded_url(self, el, url):
    """
    Decide whether ``url`` may be embedded through the element ``el``.

    Returns True only when ``el.tag`` is in ``whitelist_tags`` (or
    ``whitelist_tags`` is None), the URL scheme is ``http`` or ``https``
    and the URL's host is in ``host_whitelist``.  URLs that cannot be
    parsed are rejected.
    """
    # NOTE(review): the ``def`` line was missing from the restored text;
    # the name comes from the ``self.allow_embedded_url(el, url)`` calls.
    if (self.whitelist_tags is not None
        and el.tag not in self.whitelist_tags):
        return False
    try:
        parts = urlsplit(url)
        # ``hostname`` is lower-cased and excludes userinfo and port.
        # Cutting the raw netloc at the first ':' would instead turn
        # "good.example:pw@evil.example" into "good.example" and let a
        # foreign host through the whitelist.
        host = parts.hostname
    except ValueError:
        return False
    if parts.scheme not in ('http', 'https'):
        return False
    if not host:
        return False
    return host in self.host_whitelist
432
443
445 bad = []
446 for el in doc.iter(iterate):
447 if condition(el):
448 bad.append(el)
449 for el in bad:
450 el.drop_tree()
451
459
460 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
461
def _has_sneaky_javascript(self, style):
    """
    Report whether ``style`` hides script behind simple obfuscation.

    Depending on the browser, text like ``e x p r e s s i o n(...)`` or
    ``expre/* stuff */ssion(...)`` can still be interpreted.  This
    removes CSS comments, backslashes and all whitespace, lower-cases
    what is left, and then looks for ``javascript:`` or ``expression(``.

    Returns True when either marker is found; callers typically respond
    by discarding the whole style.
    """
    # NOTE(review): the ``def`` line was missing from the restored text;
    # the name comes from the ``self._has_sneaky_javascript(new)`` calls.
    normalized = self._substitute_comments('', style)
    normalized = normalized.replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    return 'javascript:' in normalized or 'expression(' in normalized
482
491
492 clean = Cleaner()
493 clean_html = clean.clean_html
494
495
496
497
498
# Patterns that recognise linkable text.  Each one defines a ``host``
# group and a ``body`` group, which ``_link_text`` reads from the match.
# NOTE(review): these four lists look like the default arguments of
# ``autolink`` -- confirm against its signature.
_link_regexes = [
    # http:// and https:// URLs with an optional path.
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    # "mailto:" followed by an address; ``body`` is the address alone.
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

# Tag names whose content is left alone.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Host patterns: localhost (prefix match), example.com/.org/.net and
# 127.0.0.1.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

# Class names that opt an element out.
_avoid_classes = ['nolink']
514
# NOTE(review): the ``def`` lines were missing from the restored text.  The
# parameter names come from the recursive call below; the defaults are
# assumed to be the module-level lists of the same names -- confirm.
def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    Text is searched with the given regular expressions (by default
    mailto and http(s) links) and every hit is wrapped in an ``<a>``.

    Nothing is linked inside an element listed in avoid_elements or one
    carrying a class from avoid_classes, and no link is made to a host
    matching one of the regular expressions in avoid_hosts (by default
    localhost, example.com/.org/.net and 127.0.0.1).

    Only the contents of the element passed in are processed; its own
    tail text is left untouched.
    """
    if el.tag in avoid_elements:
        return
    class_attr = el.get('class')
    if class_attr:
        present = class_attr.split()
        if any(name in present for name in avoid_classes):
            return
    # Iterate over a snapshot: new anchors are inserted while looping.
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if not child.tail:
            continue
        remaining, anchors = _link_text(
            child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            child.tail = remaining
            insert_at = el.index(child) + 1
            el[insert_at:insert_at] = anchors
    # The element's own text is handled last, so the new anchors end up
    # in front of all the children.
    if el.text:
        remaining, anchors = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            el.text = remaining
            el[:0] = anchors
559
560 -def _link_text(text, link_regexes, avoid_hosts, factory):
561 leading_text = ''
562 links = []
563 last_pos = 0
564 while 1:
565 best_match, best_pos = None, None
566 for regex in link_regexes:
567 regex_pos = last_pos
568 while 1:
569 match = regex.search(text, pos=regex_pos)
570 if match is None:
571 break
572 host = match.group('host')
573 for host_regex in avoid_hosts:
574 if host_regex.search(host):
575 regex_pos = match.end()
576 break
577 else:
578 break
579 if match is None:
580 continue
581 if best_pos is None or match.start() < best_pos:
582 best_match = match
583 best_pos = match.start()
584 if best_match is None:
585
586 if links:
587 assert not links[-1].tail
588 links[-1].tail = text
589 else:
590 assert not leading_text
591 leading_text = text
592 break
593 link = best_match.group(0)
594 end = best_match.end()
595 if link.endswith('.') or link.endswith(','):
596
597 end -= 1
598 link = link[:-1]
599 prev_text = text[:best_match.start()]
600 if links:
601 assert not links[-1].tail
602 links[-1].tail = prev_text
603 else:
604 assert not leading_text
605 leading_text = prev_text
606 anchor = factory('a')
607 anchor.set('href', link)
608 body = best_match.group('body')
609 if not body:
610 body = link
611 if body.endswith('.') or body.endswith(','):
612 body = body[:-1]
613 anchor.text = body
614 links.append(anchor)
615 text = text[end:]
616 return leading_text, links
617
626
627 autolink_html.__doc__ = autolink.__doc__
628
629
630
631
632
633 _avoid_word_break_elements = ['pre', 'textarea', 'code']
634 _avoid_word_break_classes = ['nobreak']
635
# NOTE(review): the ``def`` lines were missing from the restored text.  The
# parameter names come from the recursive call below; the default values
# (``max_width=40`` and the zero-width-space character in particular) are
# reconstructed and must be confirmed against the upstream file.
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<pre>``, ``<textarea>`` and ``<code>``, nor elements carrying one
    of the classes in avoid_classes.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Honour the caller's list; the module-level default must not be
    # consulted here, or a custom ``avoid_elements`` would be ignored.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        for avoid in avoid_classes:
            if avoid in classes:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # A child's tail belongs to this element's content, so it is
        # broken even when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
676
682
683 -def _break_text(text, max_width, break_character):
684 words = text.split()
685 for word in words:
686 if len(word) > max_width:
687 replacement = _insert_break(word, max_width, break_character)
688 text = text.replace(word, replacement)
689 return text
690
691 _break_prefer_re = re.compile(r'[^a-z]', re.I)
692
694 orig_word = word
695 result = ''
696 while len(word) > width:
697 start = word[:width]
698 breaks = list(_break_prefer_re.finditer(start))
699 if breaks:
700 last_break = breaks[-1]
701
702 if last_break.end() > width-10:
703
704
705 start = word[:last_break.end()]
706 result += start + break_character
707 word = word[len(start):]
708 result += word
709 return result
710