1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 except ImportError:
12
13 from urllib.parse import urlsplit
14 from lxml import etree
15 from lxml.html import defs
16 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
17 from lxml.html import xhtml_to_html, _transform_result
18
# Compatibility aliases for builtins that only exist on some Python versions.
#
# Each name is looked up by plain name resolution rather than by subscripting
# ``__builtins__``: ``__builtins__`` is a dict in imported modules but the
# builtins *module* in ``__main__``, where subscripting it raises a TypeError
# that the old ``except (NameError, KeyError)`` clauses did not catch.
# The ``name = name`` form keeps every alias a real module-level attribute.

try:
    set
except NameError:
    # Very old Python 2 without a builtin set type
    from sets import Set as set

try:
    unichr = unichr
except NameError:
    # Python 3: chr() already returns text
    unichr = chr

try:
    unicode = unicode
except NameError:
    # Python 3: str is the text type
    unicode = str

try:
    bytes = bytes
except NameError:
    # Python 2 without a bytes builtin
    bytes = str

try:
    basestring = basestring
except NameError:
    # Python 3: usable as the second argument of isinstance()
    basestring = (str, bytes)
47
48
49 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
50 'word_break', 'word_break_html']
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# ``expression( ... )`` inside CSS, up to the first closing parenthesis
# (case-insensitive, may span lines).
_css_javascript_re = re.compile(r'expression\s*\(.*?\)', re.I | re.S)

# The start of a CSS ``@import`` rule, with optional whitespace after the "@".
_css_import_re = re.compile(r'@\s*import', re.I)

# A URL scheme from the list below, optionally preceded by whitespace.
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I)

# ``_substitute_whitespace(repl, text)`` replaces every whitespace run in
# ``text`` with ``repl``.
_substitute_whitespace = re.compile(r'\s+').sub

# The ``[if ...]>`` opener of a conditional comment.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I | re.S)
90
# Every element (the context node included) that carries a style attribute.
_find_styled_elements = etree.XPath("descendant-or-self::*[@style]")

# Every <a>, plain or in the XHTML namespace, whose href is non-blank and
# does not start with "#".
_find_external_links = etree.XPath(
    ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x': XHTML_NAMESPACE})
98
100 """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.
104
105 ``scripts``:
106 Removes any ``<script>`` tags.
107
108 ``javascript``:
109 Removes any Javascript, like an ``onclick`` attribute.
110
111 ``comments``:
112 Removes any comments.
113
114 ``style``:
115 Removes any style tags or attributes.
116
117 ``links``:
118 Removes any ``<link>`` tags
119
120 ``meta``:
121 Removes any ``<meta>`` tags
122
123 ``page_structure``:
124 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
125
126 ``processing_instructions``:
127 Removes any processing instructions.
128
129 ``embedded``:
130 Removes any embedded objects (flash, iframes)
131
132 ``frames``:
133 Removes any frame-related tags
134
135 ``forms``:
136 Removes any form tags
137
138 ``annoying_tags``:
139 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
140
141 ``remove_tags``:
142 A list of tags to remove. Only the tags will be removed,
143 their content will get pulled up into the parent tag.
144
145 ``kill_tags``:
146 A list of tags to kill. Killing also removes the tag's content,
147 i.e. the whole subtree, not just the tag itself.
148
149 ``allow_tags``:
150 A list of tags to include (default include all).
151
152 ``remove_unknown_tags``:
153 Remove any tags that aren't standard parts of HTML.
154
155 ``safe_attrs_only``:
156 If true, only include 'safe' attributes (specifically the list
157 from `feedparser
158 <http://feedparser.org/docs/html-sanitization.html>`_).
159
160 ``add_nofollow``:
161 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
162
163 ``host_whitelist``:
164 A list or set of hosts that you can use for embedded content
165 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
166 You can also implement/override the method
167 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
168 implement more complex rules for what can be embedded.
169 Anything that passes this test will be shown, regardless of
170 the value of (for instance) ``embedded``.
171
172 Note that this parameter might not work as intended if you do not
173 make the links absolute before doing the cleaning.
174
175 ``whitelist_tags``:
176 A set of tags that can be included with ``host_whitelist``.
177 The default is ``iframe`` and ``embed``; you may wish to
178 include other tags like ``script``, or you may want to
179 implement ``allow_embedded_url`` for more control. Set to None to
180 include all tags.
181
182 This modifies the document *in place*.
183 """
184
# Default values of the cleaning options.  For the category switches,
# True means that category of content is removed.
scripts = True
javascript = True
comments = True
style = False
links = True
meta = True
page_structure = True
processing_instructions = True
embedded = True
frames = True
forms = True
annoying_tags = True

# Explicit tag lists; None means "nothing configured".
remove_tags = None
allow_tags = None
kill_tags = None

remove_unknown_tags = True
safe_attrs_only = True
add_nofollow = False

# Hosts trusted for embedded content, and the tags that trust applies to.
host_whitelist = ()
whitelist_tags = set(['iframe', 'embed'])

# For each tag, the attribute (or list of attributes) holding the URL that
# decides whether the element may be kept.
_tag_link_attrs = {
    'script': 'src',
    'link': 'href',
    'applet': ['code', 'object'],
    'iframe': 'src',
    'embed': 'src',
    'layer': 'src',
    'a': 'href',
}
234
def __call__(self, doc):
    """
    Cleans the document.

    ``doc`` is an element, or any object with a ``getroot()`` method (its
    root element is then cleaned).  The tree is modified in place and
    nothing is returned.  What gets removed is decided by the option
    attributes read from ``self`` (``scripts``, ``javascript``, ``style``,
    ``kill_tags``, ...).

    Raises ``ValueError`` when both ``allow_tags`` and
    ``remove_unknown_tags`` are set.
    """
    if self.remove_unknown_tags and self.allow_tags:
        # Checked up front so that a contradictory configuration does not
        # leave the document half-cleaned.
        raise ValueError(
            "It does not make sense to pass in both allow_tags and remove_unknown_tags")

    if hasattr(doc, 'getroot'):
        # Work on the root element of a tree object.
        doc = doc.getroot()
    xhtml_to_html(doc)
    # Treat <image> as a spelling of <img>.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # When comments are removed wholesale (below) this is unnecessary;
        # when they are kept, conditional comments get their own pass.
        self.kill_conditional_comments(doc)

    # kill_tags: element and content are dropped.
    # remove_tags: only the tag is dropped, its content is pulled up.
    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the names: attributes are deleted while looping.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # Only needed when the safe-attribute pass above did not run
            # (presumably defs.safe_attrs contains no on* handlers).
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # Styles are being kept, so script has to be removed from them.
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new``: substituting on ``old`` again would
                # throw away the expression() removal just made.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Obfuscated script remains: drop the whole attribute.
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # Same chaining as for the style attribute above.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Obfuscated script remains: blank the stylesheet.
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # NOTE(review): comments are also killed when only
        # processing_instructions is set; the reason is not visible here.
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # <link> elements are kept in general, but stylesheet links are
        # still dropped.
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # Drop every <param> that has no <applet> or <object> ancestor.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # These lose only the tag; their content is pulled up.
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] is doc:
        # The top-most element cannot be dropped; turn it into a bare
        # <div> instead.
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] is doc:
        # The top-most element cannot be dropped; empty it instead.
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    # Drop in reverse document order, so descendants go before ancestors.
    _kill.reverse()
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = [el for el in doc.iter() if el.tag not in allow_tags]
        if bad:
            if bad[0] is doc:
                # Same top-most element workaround as above.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')
403
def allow_follow(self, el):
    """
    Hook deciding whether the anchor ``el`` is spared ``rel="nofollow"``.

    Return a true value to leave the anchor untouched.  This default
    implementation returns False for every anchor; override it to exempt
    some of them.
    """
    return False
409
def allow_element(self, el):
    """
    Tell whether ``el`` may be kept because of the URL(s) it refers to.

    Tags not listed in ``_tag_link_attrs`` are never allowed.  For listed
    tags every URL attribute named there must be present and non-empty,
    and must be accepted by ``allow_embedded_url(el, url)``.
    """
    if el.tag not in self._tag_link_attrs:
        return False
    attr_names = self._tag_link_attrs[el.tag]
    if not isinstance(attr_names, (list, tuple)):
        # A single attribute: the URL check's own result is the answer.
        url = el.get(attr_names)
        if not url:
            return False
        return self.allow_embedded_url(el, url)
    # Several attributes: all of them have to pass.
    for attr_name in attr_names:
        url = el.get(attr_name)
        if not url or not self.allow_embedded_url(el, url):
            return False
    return True
427
def allow_embedded_url(self, el, url):
    """
    Tell whether ``url`` is an acceptable source for the element ``el``.

    The URL is accepted only if ``el.tag`` is in ``whitelist_tags`` (any
    tag when that is None), the scheme is http or https, and the host,
    lower-cased and without a port, is in ``host_whitelist``.
    """
    permitted_tags = self.whitelist_tags
    if permitted_tags is not None and el.tag not in permitted_tags:
        return False
    parts = urlsplit(url)
    if parts[0] not in ('http', 'https'):
        return False
    # parts[1] is the network location; cut off a ":port" suffix.
    host = parts[1].lower().split(':', 1)[0]
    return host in self.host_whitelist
439
450
# Matching elements are collected first and dropped afterwards, so
# nothing is removed while doc.iter() is still running.
doomed = [el for el in doc.iter(iterate) if condition(el)]
for el in doomed:
    el.drop_tree()
458
466
# ``_substitute_comments(repl, css)`` replaces every ``/* ... */`` comment.
_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

def _has_sneaky_javascript(self, style):
    """
    Tell whether the CSS text ``style`` hides script behind obfuscation.

    Some browsers interpret ``e x p r e s s i o n(...)`` or
    ``expre/* stuff */ssion(...)``.  The text is therefore normalised
    first, by removing comments, backslashes and all whitespace and by
    lower-casing it, and then searched for ``javascript:`` and
    ``expression(``.

    Callers typically discard the whole style on a True result; plain,
    unobfuscated script is removed piecemeal by other rules.
    """
    normalised = self._substitute_comments('', style)
    normalised = normalised.replace('\\', '')
    normalised = _substitute_whitespace('', normalised).lower()
    return 'javascript:' in normalised or 'expression(' in normalised
489
498
499 clean = Cleaner()
500 clean_html = clean.clean_html
501
502
503
504
505
# Patterns for text that is turned into links.  Each one defines the
# named groups ``body`` and ``host``.
_link_regexes = [
    # http:// and https:// URLs
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # mailto: addresses; ``body`` is the address without the scheme
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
]

# Elements whose text is not linked.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Patterns for hosts that are not linked.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
]

# CSS classes that mark an element as not to be linked.
_avoid_classes = ['nolink']
521
# NOTE(review): the signature line is missing from this copy of the file.
# It is rebuilt from the recursive call below, the docstring and the
# default constants defined above -- confirm against the original.
def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    Text matching one of ``link_regexes`` (by default mailto and http(s)
    links) is wrapped in ``<a>`` elements, recursively for the whole
    subtree of ``el`` and in place.

    Nothing is linked inside an element whose tag is in ``avoid_elements``
    or which has a class from ``avoid_classes``, and no link is made to a
    host matching one of the regular expressions in ``avoid_hosts``
    (default localhost and 127.0.0.1).

    The tail of ``el`` itself is not processed, only its contents.
    """
    if el.tag in avoid_elements:
        return
    class_attr = el.get('class')
    if class_attr:
        own_classes = class_attr.split()
        if any(avoided in own_classes for avoided in avoid_classes):
            return
    # Iterate over a copy: new anchors are inserted into ``el`` on the way.
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if not child.tail:
            continue
        remaining, anchors = _link_text(
            child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            child.tail = remaining
            # Put the new anchors directly after this child.
            insert_at = el.index(child) + 1
            el[insert_at:insert_at] = anchors
    if el.text:
        remaining, anchors = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            el.text = remaining
            # Anchors found in the leading text become the first children.
            el[:0] = anchors
566
def _link_text(text, link_regexes, avoid_hosts, factory):
    """
    Split ``text`` into plain text and anchor elements for the links in it.

    ``link_regexes`` are compiled patterns defining the named groups
    ``host`` and ``body``.  A match whose host matches any pattern in
    ``avoid_hosts`` stays plain text.  ``factory('a')`` is called to create
    each anchor.

    Returns ``(leading_text, links)``: the text before the first link and
    the list of anchors in order of appearance.  Each anchor gets the whole
    match as ``href``, the ``body`` group as its text, and the plain text
    up to the next link (or the end) as its tail.  Trailing ``.`` and ``,``
    characters are left out of the link.
    """
    leading_text = ''
    links = []
    while True:
        # Across all patterns, find the leftmost match with an allowed host.
        best_match = None
        for regex in link_regexes:
            pos = 0
            match = regex.search(text, pos)
            while match is not None and any(
                    host_regex.search(match.group('host'))
                    for host_regex in avoid_hosts):
                # Avoided host: look for this pattern's next match.
                pos = match.end()
                match = regex.search(text, pos)
            if match is not None and (
                    best_match is None
                    or match.start() < best_match.start()):
                best_match = match

        if best_match is None:
            # No more links: the rest is plain text.
            if links:
                assert not links[-1].tail
                links[-1].tail = text
            else:
                assert not leading_text
                leading_text = text
            return leading_text, links

        matched = best_match.group(0)
        # Full stops and commas right after a link are sentence
        # punctuation, so all of them are stripped ("http://host/page..."),
        # not just one.  A match made of nothing else is kept whole so the
        # loop always moves forward.
        link = matched.rstrip('.,') or matched
        end = best_match.end() - (len(matched) - len(link))

        prev_text = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = prev_text
        else:
            assert not leading_text
            leading_text = prev_text

        anchor = factory('a')
        anchor.set('href', link)
        body = best_match.group('body') or link
        anchor.text = body.rstrip('.,') or body
        links.append(anchor)
        text = text[end:]
624
633
634 autolink_html.__doc__ = autolink.__doc__
635
636
637
638
639
640 _avoid_word_break_elements = ['pre', 'textarea', 'code']
641 _avoid_word_break_classes = ['nobreak']
642
647 """
648 Breaks any long words found in the body of the text (not attributes).
649
650 Doesn't effect any of the tags in avoid_elements, by default
651 ``<textarea>`` and ``<pre>``
652
653 Breaks words by inserting ​, which is a unicode character
654 for Zero Width Space character. This generally takes up no space
655 in rendering, but does copy as a space, and in monospace contexts
656 usually takes up space.
657
658 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
659 """
660
661
662 if el.tag in _avoid_word_break_elements:
663 return
664 class_name = el.get('class')
665 if class_name:
666 dont_break = False
667 class_name = class_name.split()
668 for avoid in avoid_classes:
669 if avoid in class_name:
670 dont_break = True
671 break
672 if dont_break:
673 return
674 if el.text:
675 el.text = _break_text(el.text, max_width, break_character)
676 for child in el:
677 word_break(child, max_width=max_width,
678 avoid_elements=avoid_elements,
679 avoid_classes=avoid_classes,
680 break_character=break_character)
681 if child.tail:
682 child.tail = _break_text(child.tail, max_width, break_character)
683
689
# A maximal run of non-whitespace characters, i.e. one word.
_word_re = re.compile(r'\S+', re.U)

def _break_text(text, max_width, break_character):
    """
    Return ``text`` with ``break_character`` inserted into every word
    longer than ``max_width`` characters.

    Whitespace is preserved exactly.  Each word is rewritten where it
    stands, so a long word that also occurs inside another, longer word
    cannot change where that other word is broken.
    """
    def _break_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word
    return _word_re.sub(_break_word, text)

# Characters that are preferred as break points: anything but a letter.
_break_prefer_re = re.compile(r'[^a-z]', re.I)

def _insert_break(word, width, break_character):
    """
    Cut ``word`` into pieces of at most ``width`` characters and join them
    with ``break_character``.

    A piece is cut right after its last non-letter character when that
    character lies within the final 10 characters of the piece; otherwise
    it is cut at exactly ``width`` characters.

    Raises ``ValueError`` if ``width`` is smaller than 1; the loop below
    could not make progress with such a width.
    """
    if width < 1:
        raise ValueError("width must be at least 1, got %r" % (width,))
    pieces = []
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only use the preferred break if it is near the end of the
            # piece; an early one would produce a needlessly short piece.
            if last_break.end() > width - 10:
                start = word[:last_break.end()]
        pieces.append(start)
        word = word[len(start):]
    pieces.append(word)
    return break_character.join(pieces)
717