1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 except ImportError:
12
13 from urllib.parse import urlsplit
14 from lxml import etree
15 from lxml.html import defs
16 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
17 from lxml.html import xhtml_to_html, _transform_result
18
# Compatibility shims: each block probes for a builtin name and, when the
# probe raises NameError, binds a stand-in so the rest of the module can use
# the same name on every supported interpreter.
try:
    unichr
except NameError:
    # No ``unichr`` builtin on this interpreter: ``chr`` is used instead.
    unichr = chr
try:
    unicode
except NameError:
    # No ``unicode`` builtin on this interpreter: ``str`` is used instead.
    unicode = str
try:
    bytes
except NameError:
    # No ``bytes`` builtin on this interpreter: ``str`` is used instead.
    bytes = str
try:
    basestring
except NameError:
    # No ``basestring`` builtin: a tuple works for isinstance() checks.
    # Must come after the ``bytes`` shim above, since it refers to ``bytes``.
    basestring = (str, bytes)
38
39
40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
41 'word_break', 'word_break_html']
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Matches CSS ``expression(...)`` constructs, case-insensitively and across
# newlines (non-greedy up to the first closing parenthesis).
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Matches the start of a CSS ``@import`` rule, allowing whitespace between
# the ``@`` and ``import``.
_css_import_re = re.compile(
    r'@\s*import', re.I)

# Bound ``search`` of a pattern listing URL schemes (not only
# ``javascript:``); returns a match object when any of them occurs
# anywhere in the given string.
_is_javascript_scheme = re.compile(
    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I).search
# Bound ``sub`` that replaces runs of whitespace and low control characters.
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub

# Matches the ``[if ...]>`` opener of a conditional comment.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# XPath: the context element and all descendants that carry a ``style``
# attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# XPath: ``<a>`` elements (plain or in the XHTML namespace) whose ``href`` is
# non-empty after whitespace normalisation and does not start with ``#``.
_find_external_links = etree.XPath(
    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
90
92 """
93 Instances clean the document of each of the possible offending
94 elements. The cleaning is controlled by attributes; you can
95 override attributes in a subclass, or set them in the constructor.
96
97 ``scripts``:
98 Removes any ``<script>`` tags.
99
100 ``javascript``:
101 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
102 as they could contain Javascript.
103
104 ``comments``:
105 Removes any comments.
106
107 ``style``:
108 Removes any style tags or attributes.
109
110 ``links``:
111 Removes any ``<link>`` tags
112
113 ``meta``:
114 Removes any ``<meta>`` tags
115
116 ``page_structure``:
117 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
118
119 ``processing_instructions``:
120 Removes any processing instructions.
121
122 ``embedded``:
123 Removes any embedded objects (flash, iframes)
124
125 ``frames``:
126 Removes any frame-related tags
127
128 ``forms``:
129 Removes any form tags
130
131 ``annoying_tags``:
132 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
133
134 ``remove_tags``:
135 A list of tags to remove. Only the tags will be removed,
136 their content will get pulled up into the parent tag.
137
138 ``kill_tags``:
139 A list of tags to kill. Killing also removes the tag's content,
140 i.e. the whole subtree, not just the tag itself.
141
142 ``allow_tags``:
143 A list of tags to include (default include all).
144
145 ``remove_unknown_tags``:
146 Remove any tags that aren't standard parts of HTML.
147
148 ``safe_attrs_only``:
149 If true, only include 'safe' attributes (specifically the list
150 from the feedparser HTML sanitisation web site).
151
152 ``safe_attrs``:
153 A set of attribute names to override the default list of attributes
154 considered 'safe' (when safe_attrs_only=True).
155
156 ``add_nofollow``:
157 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
158
159 ``host_whitelist``:
160 A list or set of hosts that you can use for embedded content
161 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
162 You can also implement/override the method
163 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
164 implement more complex rules for what can be embedded.
165 Anything that passes this test will be shown, regardless of
166 the value of (for instance) ``embedded``.
167
168 Note that this parameter might not work as intended if you do not
169 make the links absolute before doing the cleaning.
170
171 Note that you may also need to set ``whitelist_tags``.
172
173 ``whitelist_tags``:
174 A set of tags that can be included with ``host_whitelist``.
175 The default is ``iframe`` and ``embed``; you may wish to
176 include other tags like ``script``, or you may want to
177 implement ``allow_embedded_url`` for more control. Set to None to
178 include all tags.
179
180 This modifies the document *in place*.
181 """
182
183 scripts = True
184 javascript = True
185 comments = True
186 style = False
187 links = True
188 meta = True
189 page_structure = True
190 processing_instructions = True
191 embedded = True
192 frames = True
193 forms = True
194 annoying_tags = True
195 remove_tags = None
196 allow_tags = None
197 kill_tags = None
198 remove_unknown_tags = True
199 safe_attrs_only = True
200 safe_attrs = defs.safe_attrs
201 add_nofollow = False
202 host_whitelist = ()
203 whitelist_tags = set(['iframe', 'embed'])
204
211
212
213
214 _tag_link_attrs = dict(
215 script='src',
216 link='href',
217
218
219 applet=['code', 'object'],
220 iframe='src',
221 embed='src',
222 layer='src',
223
224
225
226
227
228
229
230
231 a='href',
232 )
233
235 """
236 Cleans the document.
237 """
238 if hasattr(doc, 'getroot'):
239
240 doc = doc.getroot()
241
242 xhtml_to_html(doc)
243
244
245 for el in doc.iter('image'):
246 el.tag = 'img'
247 if not self.comments:
248
249
250 self.kill_conditional_comments(doc)
251
252 kill_tags = set(self.kill_tags or ())
253 remove_tags = set(self.remove_tags or ())
254 allow_tags = set(self.allow_tags or ())
255
256 if self.scripts:
257 kill_tags.add('script')
258 if self.safe_attrs_only:
259 safe_attrs = set(self.safe_attrs)
260 for el in doc.iter(etree.Element):
261 attrib = el.attrib
262 for aname in attrib.keys():
263 if aname not in safe_attrs:
264 del attrib[aname]
265 if self.javascript:
266 if not (self.safe_attrs_only and
267 self.safe_attrs == defs.safe_attrs):
268
269 for el in doc.iter(etree.Element):
270 attrib = el.attrib
271 for aname in attrib.keys():
272 if aname.startswith('on'):
273 del attrib[aname]
274 doc.rewrite_links(self._remove_javascript_link,
275 resolve_base_href=False)
276 if not self.style:
277
278
279 for el in _find_styled_elements(doc):
280 old = el.get('style')
281 new = _css_javascript_re.sub('', old)
282 new = _css_import_re.sub('', new)
283 if self._has_sneaky_javascript(new):
284
285 del el.attrib['style']
286 elif new != old:
287 el.set('style', new)
288 for el in list(doc.iter('style')):
289 if el.get('type', '').lower().strip() == 'text/javascript':
290 el.drop_tree()
291 continue
292 old = el.text or ''
293 new = _css_javascript_re.sub('', old)
294
295 new = _css_import_re.sub('', old)
296 if self._has_sneaky_javascript(new):
297
298 el.text = '/* deleted */'
299 elif new != old:
300 el.text = new
301 if self.comments or self.processing_instructions:
302
303
304
305 kill_tags.add(etree.Comment)
306 if self.processing_instructions:
307 kill_tags.add(etree.ProcessingInstruction)
308 if self.style:
309 kill_tags.add('style')
310 etree.strip_attributes(doc, 'style')
311 if self.links:
312 kill_tags.add('link')
313 elif self.style or self.javascript:
314
315
316 for el in list(doc.iter('link')):
317 if 'stylesheet' in el.get('rel', '').lower():
318
319 if not self.allow_element(el):
320 el.drop_tree()
321 if self.meta:
322 kill_tags.add('meta')
323 if self.page_structure:
324 remove_tags.update(('head', 'html', 'title'))
325 if self.embedded:
326
327
328
329 for el in list(doc.iter('param')):
330 found_parent = False
331 parent = el.getparent()
332 while parent is not None and parent.tag not in ('applet', 'object'):
333 parent = parent.getparent()
334 if parent is None:
335 el.drop_tree()
336 kill_tags.update(('applet',))
337
338 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
339 if self.frames:
340
341
342
343 kill_tags.update(defs.frame_tags)
344 if self.forms:
345 remove_tags.add('form')
346 kill_tags.update(('button', 'input', 'select', 'textarea'))
347 if self.annoying_tags:
348 remove_tags.update(('blink', 'marquee'))
349
350 _remove = []
351 _kill = []
352 for el in doc.iter():
353 if el.tag in kill_tags:
354 if self.allow_element(el):
355 continue
356 _kill.append(el)
357 elif el.tag in remove_tags:
358 if self.allow_element(el):
359 continue
360 _remove.append(el)
361
362 if _remove and _remove[0] == doc:
363
364
365 el = _remove.pop(0)
366 el.tag = 'div'
367 el.attrib.clear()
368 elif _kill and _kill[0] == doc:
369
370
371 el = _kill.pop(0)
372 if el.tag != 'html':
373 el.tag = 'div'
374 el.clear()
375
376 _kill.reverse()
377 for el in _kill:
378 el.drop_tree()
379 for el in _remove:
380 el.drop_tag()
381
382 if self.remove_unknown_tags:
383 if allow_tags:
384 raise ValueError(
385 "It does not make sense to pass in both allow_tags and remove_unknown_tags")
386 allow_tags = set(defs.tags)
387 if allow_tags:
388 bad = []
389 for el in doc.iter():
390 if el.tag not in allow_tags:
391 bad.append(el)
392 if bad:
393 if bad[0] is doc:
394 el = bad.pop(0)
395 el.tag = 'div'
396 el.attrib.clear()
397 for el in bad:
398 el.drop_tag()
399 if self.add_nofollow:
400 for el in _find_external_links(doc):
401 if not self.allow_follow(el):
402 rel = el.get('rel')
403 if rel:
404 if ('nofollow' in rel
405 and ' nofollow ' in (' %s ' % rel)):
406 continue
407 rel = '%s nofollow' % rel
408 else:
409 rel = 'nofollow'
410 el.set('rel', rel)
411
413 """
414 Override to suppress rel="nofollow" on some anchors.
415 """
416 return False
417
419 if el.tag not in self._tag_link_attrs:
420 return False
421 attr = self._tag_link_attrs[el.tag]
422 if isinstance(attr, (list, tuple)):
423 for one_attr in attr:
424 url = el.get(one_attr)
425 if not url:
426 return False
427 if not self.allow_embedded_url(el, url):
428 return False
429 return True
430 else:
431 url = el.get(attr)
432 if not url:
433 return False
434 return self.allow_embedded_url(el, url)
435
437 if (self.whitelist_tags is not None
438 and el.tag not in self.whitelist_tags):
439 return False
440 scheme, netloc, path, query, fragment = urlsplit(url)
441 netloc = netloc.lower().split(':', 1)[0]
442 if scheme not in ('http', 'https'):
443 return False
444 if netloc in self.host_whitelist:
445 return True
446 return False
447
458
460 bad = []
461 for el in doc.iter(iterate):
462 if condition(el):
463 bad.append(el)
464 for el in bad:
465 el.drop_tree()
466
474
475 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
476
478 """
479 Depending on the browser, stuff like ``e x p r e s s i o n(...)``
480 can get interpreted, or ``expre/* stuff */ssion(...)``. This
481 checks for attempts to do stuff like this.
482
483 Typically the response will be to kill the entire style; if you
484 have just a bit of Javascript in the style another rule will catch
485 that and remove only the Javascript from the style; this catches
486 more sneaky attempts.
487 """
488 style = self._substitute_comments('', style)
489 style = style.replace('\\', '')
490 style = _substitute_whitespace('', style)
491 style = style.lower()
492 if 'javascript:' in style:
493 return True
494 if 'expression(' in style:
495 return True
496 return False
497
506
507 clean = Cleaner()
508 clean_html = clean.clean_html
509
510
511
512
513
# Patterns recognised as linkable text.  Each one defines the named groups
# ``body`` (used as the anchor text by ``_link_text``) and ``host`` (checked
# against the avoid-host patterns by ``_link_text``).
_link_regexes = [
    # http:// and https:// URLs, with an optional path and an optional
    # trailing parenthesised segment.
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # mailto: addresses; ``body`` excludes the ``mailto:`` prefix.
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

# Tag names in which text is not auto-linked.
# NOTE(review): presumably the default for ``autolink(avoid_elements=...)``;
# the function header is not visible here -- confirm.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Host patterns that should never be linked: anything starting with
# ``localhost``, ``example.com/.org/.net`` at the end of the host, and
# exactly ``127.0.0.1``.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

# CSS class names that opt an element out of auto-linking.
# NOTE(review): presumably the default for ``autolink(avoid_classes=...)`` --
# confirm against the function header.
_avoid_classes = ['nolink']
529
534 """
535 Turn any URLs into links.
536
537 It will search for links identified by the given regular
538 expressions (by default mailto and http(s) links).
539
540 It won't link text in an element in avoid_elements, or an element
541 with a class in avoid_classes. It won't link to anything with a
542 host that matches one of the regular expressions in avoid_hosts
543 (default localhost and 127.0.0.1).
544
545 If you pass in an element, the element's tail will not be
546 substituted, only the contents of the element.
547 """
548 if el.tag in avoid_elements:
549 return
550 class_name = el.get('class')
551 if class_name:
552 class_name = class_name.split()
553 for match_class in avoid_classes:
554 if match_class in class_name:
555 return
556 for child in list(el):
557 autolink(child, link_regexes=link_regexes,
558 avoid_elements=avoid_elements,
559 avoid_hosts=avoid_hosts,
560 avoid_classes=avoid_classes)
561 if child.tail:
562 text, tail_children = _link_text(
563 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
564 if tail_children:
565 child.tail = text
566 index = el.index(child)
567 el[index+1:index+1] = tail_children
568 if el.text:
569 text, pre_children = _link_text(
570 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
571 if pre_children:
572 el.text = text
573 el[:0] = pre_children
574
def _link_text(text, link_regexes, avoid_hosts, factory):
    """Split ``text`` into leading plain text and a list of anchor elements.

    On each pass the earliest match of any pattern in ``link_regexes`` whose
    ``host`` group is not matched by a pattern in ``avoid_hosts`` is turned
    into an element created with ``factory('a')``.  Its ``href`` is the full
    match and its text is the ``body`` group (falling back to the full
    match); a single trailing ``.`` or ``,`` is dropped from both and left in
    the surrounding text.  Text between links is stored on the ``tail`` of
    the preceding anchor.

    Returns ``(leading_text, anchors)``.  When nothing is linked, ``anchors``
    is empty and ``leading_text`` is the whole input.
    """
    prefix = ''
    anchors = []
    scan_start = 0
    while True:
        # Find the left-most acceptable match across all patterns.
        earliest = None
        for pattern in link_regexes:
            offset = scan_start
            found = pattern.search(text, pos=offset)
            while found is not None:
                host = found.group('host')
                if not any(blocked.search(host) for blocked in avoid_hosts):
                    break
                # Host is on the avoid list: retry after this match.
                offset = found.end()
                found = pattern.search(text, pos=offset)
            if found is None:
                continue
            if earliest is None or found.start() < earliest.start():
                earliest = found

        if earliest is None:
            # Nothing left to link; the remainder trails the last anchor, or
            # is the entire result when no anchor was produced.
            if not anchors:
                assert not prefix
                prefix = text
            else:
                assert not anchors[-1].tail
                anchors[-1].tail = text
            return prefix, anchors

        href = earliest.group(0)
        cut = earliest.end()
        if href.endswith(('.', ',')):
            # Trailing punctuation belongs to the sentence, not the link.
            href = href[:-1]
            cut -= 1

        before = text[:earliest.start()]
        if not anchors:
            assert not prefix
            prefix = before
        else:
            assert not anchors[-1].tail
            anchors[-1].tail = before

        anchor = factory('a')
        anchor.set('href', href)
        label = earliest.group('body') or href
        if label.endswith(('.', ',')):
            label = label[:-1]
        anchor.text = label
        anchors.append(anchor)

        # Continue scanning with whatever follows the link just emitted.
        text = text[cut:]
632
641
642 autolink_html.__doc__ = autolink.__doc__
643
644
645
646
647
# Tag names whose text ``word_break`` leaves untouched.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
# CSS class names that opt an element out of word breaking.
# NOTE(review): presumably the default for ``word_break(avoid_classes=...)``;
# the function header is not visible here -- confirm.
_avoid_word_break_classes = ['nobreak']
650
655 """
656 Breaks any long words found in the body of the text (not attributes).
657
658 Doesn't affect any of the tags in avoid_elements, by default
659 ``<textarea>`` and ``<pre>``
660
661 Breaks words by inserting &#8203;, which is a unicode character
662 for Zero Width Space character. This generally takes up no space
663 in rendering, but does copy as a space, and in monospace contexts
664 usually takes up space.
665
666 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
667 """
668
669
670 if el.tag in _avoid_word_break_elements:
671 return
672 class_name = el.get('class')
673 if class_name:
674 dont_break = False
675 class_name = class_name.split()
676 for avoid in avoid_classes:
677 if avoid in class_name:
678 dont_break = True
679 break
680 if dont_break:
681 return
682 if el.text:
683 el.text = _break_text(el.text, max_width, break_character)
684 for child in el:
685 word_break(child, max_width=max_width,
686 avoid_elements=avoid_elements,
687 avoid_classes=avoid_classes,
688 break_character=break_character)
689 if child.tail:
690 child.tail = _break_text(child.tail, max_width, break_character)
691
697
def _break_text(text, max_width, break_character):
    """Return ``text`` with break characters inserted into over-long words.

    ``text`` is split on whitespace; every word longer than ``max_width`` is
    passed to ``_insert_break`` and each occurrence of that word in ``text``
    is replaced by the result.  Whitespace in ``text`` is left as it was.
    """
    for token in text.split():
        if len(token) <= max_width:
            continue
        broken = _insert_break(token, max_width, break_character)
        text = text.replace(token, broken)
    return text
705
# Matches any single character other than the letters a-z (case-insensitive).
# The word-splitting code scans a word prefix with ``finditer`` and uses the
# last such character as a preferred place to break.
_break_prefer_re = re.compile(r'[^a-z]', re.I)
707
709 orig_word = word
710 result = ''
711 while len(word) > width:
712 start = word[:width]
713 breaks = list(_break_prefer_re.finditer(start))
714 if breaks:
715 last_break = breaks[-1]
716
717 if last_break.end() > width-10:
718
719
720 start = word[:last_break.end()]
721 result += start + break_character
722 word = word[len(start):]
723 result += word
724 return result
725