1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 except ImportError:
12
13 from urllib.parse import urlsplit
14 from lxml import etree
15 from lxml.html import defs
16 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
17 from lxml.html import xhtml_to_html, _transform_result
18
19 try:
20 unichr
21 except NameError:
22
23 unichr = chr
24 try:
25 unicode
26 except NameError:
27
28 unicode = str
29 try:
30 bytes
31 except NameError:
32
33 bytes = str
34 try:
35 basestring
36 except NameError:
37 basestring = (str, bytes)
38
39
40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
41 'word_break', 'word_break_html']
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Matches a CSS ``expression(...)`` call, case-insensitively; the body is
# matched non-greedily up to the first ``)`` and may span several lines.
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Matches the start of a CSS ``@import`` rule (whitespace is tolerated
# between the ``@`` and the keyword).
_css_import_re = re.compile(
    r'@\s*import', re.I)

# Matches, after optional leading whitespace, one of the listed URL
# schemes followed by a colon (``javascript:``, ``vbscript:``, ``data:``,
# ...).  Note that the pattern itself is not anchored.
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I)
# Bound ``sub`` of a whitespace-run pattern: call as
# ``_substitute_whitespace(replacement, text)``.
_substitute_whitespace = re.compile(r'\s+').sub

# Matches an ``[if ...]>`` marker; going by the name this is the opening
# of a conditional comment (the code that uses it is outside this span).
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# Selects the context element and every descendant carrying a ``style``
# attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Selects ``<a>`` elements (plain or in the XHTML namespace) whose
# whitespace-normalized ``href`` is non-empty and does not start with '#',
# i.e. links that are not purely in-page fragment references.
_find_external_links = etree.XPath(
    ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
89
91 """
92 Instances cleans the document of each of the possible offending
93 elements. The cleaning is controlled by attributes; you can
94 override attributes in a subclass, or set them in the constructor.
95
96 ``scripts``:
97 Removes any ``<script>`` tags.
98
99 ``javascript``:
100 Removes any Javascript, like an ``onclick`` attribute.
101
102 ``comments``:
103 Removes any comments.
104
105 ``style``:
106 Removes any style tags or attributes.
107
108 ``links``:
109 Removes any ``<link>`` tags
110
111 ``meta``:
112 Removes any ``<meta>`` tags
113
114 ``page_structure``:
115 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
116
117 ``processing_instructions``:
118 Removes any processing instructions.
119
120 ``embedded``:
121 Removes any embedded objects (flash, iframes)
122
123 ``frames``:
124 Removes any frame-related tags
125
126 ``forms``:
127 Removes any form tags
128
129 ``annoying_tags``:
130 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
131
132 ``remove_tags``:
133 A list of tags to remove. Only the tags will be removed,
134 their content will get pulled up into the parent tag.
135
136 ``kill_tags``:
137 A list of tags to kill. Killing also removes the tag's content,
138 i.e. the whole subtree, not just the tag itself.
139
140 ``allow_tags``:
141 A list of tags to include (default include all).
142
143 ``remove_unknown_tags``:
144 Remove any tags that aren't standard parts of HTML.
145
146 ``safe_attrs_only``:
147 If true, only include 'safe' attributes (specifically the list
148 from `feedparser
149 <http://feedparser.org/docs/html-sanitization.html>`_).
150
151 ``add_nofollow``:
152 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
153
154 ``host_whitelist``:
155 A list or set of hosts that you can use for embedded content
156 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
157 You can also implement/override the method
158 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
159 implement more complex rules for what can be embedded.
160 Anything that passes this test will be shown, regardless of
161 the value of (for instance) ``embedded``.
162
163 Note that this parameter might not work as intended if you do not
164 make the links absolute before doing the cleaning.
165
166 ``whitelist_tags``:
167 A set of tags that can be included with ``host_whitelist``.
168 The default is ``iframe`` and ``embed``; you may wish to
169 include other tags like ``script``, or you may want to
170 implement ``allow_embedded_url`` for more control. Set to None to
171 include all tags.
172
173 This modifies the document *in place*.
174 """
175
176 scripts = True
177 javascript = True
178 comments = True
179 style = False
180 links = True
181 meta = True
182 page_structure = True
183 processing_instructions = True
184 embedded = True
185 frames = True
186 forms = True
187 annoying_tags = True
188 remove_tags = None
189 allow_tags = None
190 kill_tags = None
191 remove_unknown_tags = True
192 safe_attrs_only = True
193 add_nofollow = False
194 host_whitelist = ()
195 whitelist_tags = set(['iframe', 'embed'])
196
203
204
205
206 _tag_link_attrs = dict(
207 script='src',
208 link='href',
209
210
211 applet=['code', 'object'],
212 iframe='src',
213 embed='src',
214 layer='src',
215
216
217
218
219
220
221
222
223 a='href',
224 )
225
"""
Cleans the document in place.

``doc`` may be an element or an object with a ``getroot()`` method (in
which case its root element is cleaned).  Which parts are removed is
controlled by the boolean/collection attributes read from ``self``
(``scripts``, ``javascript``, ``style``, ``kill_tags``, ...).  Nothing
is returned.

Raises ``ValueError`` when both ``allow_tags`` and
``remove_unknown_tags`` are set.
"""
if hasattr(doc, 'getroot'):
    # A tree object was passed instead of an element: clean its root.
    doc = doc.getroot()
xhtml_to_html(doc)
# Treat <image> as <img> so the rest of the cleaning sees a single tag.
for el in doc.iter('image'):
    el.tag = 'img'
if not self.comments:
    # When comments are removed anyway (see the kill_tags handling
    # below) this separate pass is not needed.
    self.kill_conditional_comments(doc)

kill_tags = set(self.kill_tags or ())
remove_tags = set(self.remove_tags or ())

if self.scripts:
    kill_tags.add('script')
if self.safe_attrs_only:
    safe_attrs = set(defs.safe_attrs)
    for el in doc.iter():
        attrib = el.attrib
        # Snapshot the names: attributes are deleted while looping.
        for aname in list(attrib.keys()):
            if aname not in safe_attrs:
                del attrib[aname]
if self.javascript:
    if not self.safe_attrs_only:
        # Only needed when the safe-attribute filter above did not run.
        for el in doc.iter():
            attrib = el.attrib
            for aname in list(attrib.keys()):
                if aname.startswith('on'):
                    del attrib[aname]
    doc.rewrite_links(self._remove_javascript_link,
                      resolve_base_href=False)
    if not self.style:
        # Styles are kept, so they have to be scrubbed individually
        # (with self.style set they are removed wholesale below).
        for el in _find_styled_elements(doc):
            old = el.get('style')
            # Chain both substitutions so neither result is discarded.
            new = _css_javascript_re.sub('', old)
            new = _css_import_re.sub('', new)
            if self._has_sneaky_javascript(new):
                # Still suspicious after scrubbing: drop the attribute.
                del el.attrib['style']
            elif new != old:
                el.set('style', new)
        for el in list(doc.iter('style')):
            if el.get('type', '').lower().strip() == 'text/javascript':
                el.drop_tree()
                continue
            old = el.text or ''
            new = _css_javascript_re.sub('', old)
            new = _css_import_re.sub('', new)
            if self._has_sneaky_javascript(new):
                # Still suspicious after scrubbing: blank the stylesheet.
                el.text = '/* deleted */'
            elif new != old:
                el.text = new
# NOTE(review): comments are also killed when only
# processing_instructions is set -- confirm this is intended.
if self.comments or self.processing_instructions:
    kill_tags.add(etree.Comment)
if self.processing_instructions:
    kill_tags.add(etree.ProcessingInstruction)
if self.style:
    kill_tags.add('style')
    etree.strip_attributes(doc, 'style')
if self.links:
    kill_tags.add('link')
elif self.style or self.javascript:
    # <link> tags are kept in general, but stylesheet links are dropped
    # when styles or Javascript are being removed.
    for el in list(doc.iter('link')):
        if 'stylesheet' in el.get('rel', '').lower():
            # Substring test: "alternate stylesheet" is dropped as well.
            el.drop_tree()
if self.meta:
    kill_tags.add('meta')
if self.page_structure:
    remove_tags.update(('head', 'html', 'title'))
if self.embedded:
    # Drop <param> elements that have no <applet>/<object> ancestor.
    for el in list(doc.iter('param')):
        parent = el.getparent()
        while parent is not None and parent.tag not in ('applet', 'object'):
            parent = parent.getparent()
        if parent is None:
            el.drop_tree()
    kill_tags.update(('applet',))
    # These are only unwrapped, so their content is kept.
    remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
if self.frames:
    kill_tags.update(defs.frame_tags)
if self.forms:
    remove_tags.add('form')
    kill_tags.update(('button', 'input', 'select', 'textarea'))
if self.annoying_tags:
    remove_tags.update(('blink', 'marquee'))

# Collect first, modify afterwards, so the tree is not changed while it
# is being iterated.  allow_element() can exempt individual elements.
_remove = []
_kill = []
for el in doc.iter():
    if el.tag in kill_tags:
        if self.allow_element(el):
            continue
        _kill.append(el)
    elif el.tag in remove_tags:
        if self.allow_element(el):
            continue
        _remove.append(el)

# The root element itself cannot be dropped; it is neutralised instead.
if _remove and _remove[0] is doc:
    el = _remove.pop(0)
    el.tag = 'div'
    el.attrib.clear()
elif _kill and _kill[0] is doc:
    el = _kill.pop(0)
    if el.tag != 'html':
        el.tag = 'div'
    el.clear()

# Drop in reverse document order, so nested elements go before their
# ancestors.
_kill.reverse()
for el in _kill:
    el.drop_tree()
for el in _remove:
    el.drop_tag()

allow_tags = self.allow_tags
if self.remove_unknown_tags:
    if allow_tags:
        raise ValueError(
            "It does not make sense to pass in both allow_tags and remove_unknown_tags")
    allow_tags = set(defs.tags)
if allow_tags:
    bad = [el for el in doc.iter() if el.tag not in allow_tags]
    if bad:
        if bad[0] is doc:
            # Same root handling as above: rename instead of dropping.
            el = bad.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        for el in bad:
            el.drop_tag()
if self.add_nofollow:
    for el in _find_external_links(doc):
        if not self.allow_follow(el):
            el.set('rel', 'nofollow')
394
396 """
397 Override to suppress rel="nofollow" on some anchors.
398 """
399 return False
400
402 if el.tag not in self._tag_link_attrs:
403 return False
404 attr = self._tag_link_attrs[el.tag]
405 if isinstance(attr, (list, tuple)):
406 for one_attr in attr:
407 url = el.get(one_attr)
408 if not url:
409 return False
410 if not self.allow_embedded_url(el, url):
411 return False
412 return True
413 else:
414 url = el.get(attr)
415 if not url:
416 return False
417 return self.allow_embedded_url(el, url)
418
420 if (self.whitelist_tags is not None
421 and el.tag not in self.whitelist_tags):
422 return False
423 scheme, netloc, path, query, fragment = urlsplit(url)
424 netloc = netloc.lower().split(':', 1)[0]
425 if scheme not in ('http', 'https'):
426 return False
427 if netloc in self.host_whitelist:
428 return True
429 return False
430
441
443 bad = []
444 for el in doc.iter(iterate):
445 if condition(el):
446 bad.append(el)
447 for el in bad:
448 el.drop_tree()
449
457
458 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
459
461 """
462 Depending on the browser, stuff like ``e x p r e s s i o n(...)``
463 can get interpreted, or ``expre/* stuff */ssion(...)``. This
464 checks for attempt to do stuff like this.
465
466 Typically the response will be to kill the entire style; if you
467 have just a bit of Javascript in the style another rule will catch
468 that and remove only the Javascript from the style; this catches
469 more sneaky attempts.
470 """
471 style = self._substitute_comments('', style)
472 style = style.replace('\\', '')
473 style = _substitute_whitespace('', style)
474 style = style.lower()
475 if 'javascript:' in style:
476 return True
477 if 'expression(' in style:
478 return True
479 return False
480
489
# Module-level Cleaner created without constructor arguments;
# ``clean_html`` is that instance's bound ``clean_html`` method, exposed
# as a plain module-level callable.
clean = Cleaner()
clean_html = clean.clean_html
492
493
494
495
496
497 _link_regexes = [
498 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
499
500 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
501 ]
502
503 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
504
505 _avoid_hosts = [
506 re.compile(r'^localhost', re.I),
507 re.compile(r'\bexample\.(?:com|org|net)$', re.I),
508 re.compile(r'^127\.0\.0\.1$'),
509 ]
510
511 _avoid_classes = ['nolink']
512
517 """
518 Turn any URLs into links.
519
520 It will search for links identified by the given regular
521 expressions (by default mailto and http(s) links).
522
523 It won't link text in an element in avoid_elements, or an element
524 with a class in avoid_classes. It won't link to anything with a
525 host that matches one of the regular expressions in avoid_hosts
526 (default localhost and 127.0.0.1).
527
528 If you pass in an element, the element's tail will not be
529 substituted, only the contents of the element.
530 """
531 if el.tag in avoid_elements:
532 return
533 class_name = el.get('class')
534 if class_name:
535 class_name = class_name.split()
536 for match_class in avoid_classes:
537 if match_class in class_name:
538 return
539 for child in list(el):
540 autolink(child, link_regexes=link_regexes,
541 avoid_elements=avoid_elements,
542 avoid_hosts=avoid_hosts,
543 avoid_classes=avoid_classes)
544 if child.tail:
545 text, tail_children = _link_text(
546 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
547 if tail_children:
548 child.tail = text
549 index = el.index(child)
550 el[index+1:index+1] = tail_children
551 if el.text:
552 text, pre_children = _link_text(
553 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
554 if pre_children:
555 el.text = text
556 el[:0] = pre_children
557
558 -def _link_text(text, link_regexes, avoid_hosts, factory):
559 leading_text = ''
560 links = []
561 last_pos = 0
562 while 1:
563 best_match, best_pos = None, None
564 for regex in link_regexes:
565 regex_pos = last_pos
566 while 1:
567 match = regex.search(text, pos=regex_pos)
568 if match is None:
569 break
570 host = match.group('host')
571 for host_regex in avoid_hosts:
572 if host_regex.search(host):
573 regex_pos = match.end()
574 break
575 else:
576 break
577 if match is None:
578 continue
579 if best_pos is None or match.start() < best_pos:
580 best_match = match
581 best_pos = match.start()
582 if best_match is None:
583
584 if links:
585 assert not links[-1].tail
586 links[-1].tail = text
587 else:
588 assert not leading_text
589 leading_text = text
590 break
591 link = best_match.group(0)
592 end = best_match.end()
593 if link.endswith('.') or link.endswith(','):
594
595 end -= 1
596 link = link[:-1]
597 prev_text = text[:best_match.start()]
598 if links:
599 assert not links[-1].tail
600 links[-1].tail = prev_text
601 else:
602 assert not leading_text
603 leading_text = prev_text
604 anchor = factory('a')
605 anchor.set('href', link)
606 body = best_match.group('body')
607 if not body:
608 body = link
609 if body.endswith('.') or body.endswith(','):
610 body = body[:-1]
611 anchor.text = body
612 links.append(anchor)
613 text = text[end:]
614 return leading_text, links
615
624
625 autolink_html.__doc__ = autolink.__doc__
626
627
628
629
630
# Tag names whose text the word-breaking code leaves untouched.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
# Class names that switch word breaking off for an element -- presumably
# the default for the ``avoid_classes`` argument; confirm against the
# function signature.
_avoid_word_break_classes = ['nobreak']
633
638 """
639 Breaks any long words found in the body of the text (not attributes).
640
641 Doesn't effect any of the tags in avoid_elements, by default
642 ``<textarea>`` and ``<pre>``
643
644 Breaks words by inserting ​, which is a unicode character
645 for Zero Width Space character. This generally takes up no space
646 in rendering, but does copy as a space, and in monospace contexts
647 usually takes up space.
648
649 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
650 """
651
652
653 if el.tag in _avoid_word_break_elements:
654 return
655 class_name = el.get('class')
656 if class_name:
657 dont_break = False
658 class_name = class_name.split()
659 for avoid in avoid_classes:
660 if avoid in class_name:
661 dont_break = True
662 break
663 if dont_break:
664 return
665 if el.text:
666 el.text = _break_text(el.text, max_width, break_character)
667 for child in el:
668 word_break(child, max_width=max_width,
669 avoid_elements=avoid_elements,
670 avoid_classes=avoid_classes,
671 break_character=break_character)
672 if child.tail:
673 child.tail = _break_text(child.tail, max_width, break_character)
674
680
681 -def _break_text(text, max_width, break_character):
682 words = text.split()
683 for word in words:
684 if len(word) > max_width:
685 replacement = _insert_break(word, max_width, break_character)
686 text = text.replace(word, replacement)
687 return text
688
# Matches a single character that is not a letter a-z (case-insensitive).
# The word-splitting code that follows searches a word prefix with this
# pattern and may end a chunk right after the last such character.
_break_prefer_re = re.compile(r'[^a-z]', re.I)
690
692 orig_word = word
693 result = ''
694 while len(word) > width:
695 start = word[:width]
696 breaks = list(_break_prefer_re.finditer(start))
697 if breaks:
698 last_break = breaks[-1]
699
700 if last_break.end() > width-10:
701
702
703 start = word[:last_break.end()]
704 result += start + break_character
705 word = word[len(start):]
706 result += word
707 return result
708