1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 except ImportError:
12
13 from urllib.parse import urlsplit
14 from lxml import etree
15 from lxml.html import defs
16 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
17 from lxml.html import xhtml_to_html, _transform_result
18
19 try:
20 unichr
21 except NameError:
22
23 unichr = chr
24 try:
25 unicode
26 except NameError:
27
28 unicode = str
29 try:
30 bytes
31 except NameError:
32
33 bytes = str
34 try:
35 basestring
36 except NameError:
37 basestring = (str, bytes)
38
39
40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
41 'word_break', 'word_break_html']
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Matches ``expression(...)`` inside CSS text, case-insensitively and
# across line breaks (non-greedy up to the first closing parenthesis).
_css_javascript_re = re.compile(r'expression\s*\(.*?\)', re.I | re.S)

# Matches the start of a CSS ``@import`` rule; whitespace between ``@``
# and ``import`` is tolerated.
_css_import_re = re.compile(r'@\s*import', re.I)

# Matches one of the listed URL schemes followed by ``:``, optionally
# preceded by whitespace.
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I)

# Bound ``sub`` of a pattern matching runs of whitespace; called as
# ``_substitute_whitespace(replacement, text)``.
_whitespace_run_re = re.compile(r'\s+')
_substitute_whitespace = _whitespace_run_re.sub
del _whitespace_run_re

# Matches an ``[if ...]>`` marker.
# NOTE(review): presumably the opening of a conditional comment; the code
# that uses this pattern is not visible in this listing -- confirm.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.S | re.I)

# Selects the context element and every descendant carrying a ``style``
# attribute.
_find_styled_elements = etree.XPath("descendant-or-self::*[@style]")

# Selects ``<a>`` elements (plain or in the XHTML namespace) whose
# whitespace-normalised ``href`` is non-empty and does not start with
# ``#``.
_find_external_links = etree.XPath(
    "descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
    "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']",
    namespaces=dict(x=XHTML_NAMESPACE))
89
91 """
92 Instances clean the document of each of the possible offending
93 elements. The cleaning is controlled by attributes; you can
94 override attributes in a subclass, or set them in the constructor.
95
96 ``scripts``:
97 Removes any ``<script>`` tags.
98
99 ``javascript``:
100 Removes any Javascript, like an ``onclick`` attribute.
101
102 ``comments``:
103 Removes any comments.
104
105 ``style``:
106 Removes any style tags or attributes.
107
108 ``links``:
109 Removes any ``<link>`` tags
110
111 ``meta``:
112 Removes any ``<meta>`` tags
113
114 ``page_structure``:
115 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
116
117 ``processing_instructions``:
118 Removes any processing instructions.
119
120 ``embedded``:
121 Removes any embedded objects (flash, iframes)
122
123 ``frames``:
124 Removes any frame-related tags
125
126 ``forms``:
127 Removes any form tags
128
129 ``annoying_tags``:
130 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
131
132 ``remove_tags``:
133 A list of tags to remove. Only the tags will be removed,
134 their content will get pulled up into the parent tag.
135
136 ``kill_tags``:
137 A list of tags to kill. Killing also removes the tag's content,
138 i.e. the whole subtree, not just the tag itself.
139
140 ``allow_tags``:
141 A list of tags to include (default include all).
142
143 ``remove_unknown_tags``:
144 Remove any tags that aren't standard parts of HTML.
145
146 ``safe_attrs_only``:
147 If true, only include 'safe' attributes (specifically the list
148 from the feedparser HTML sanitisation web site).
149
150 ``safe_attrs``:
151 A set of attribute names to override the default list of attributes
152 considered 'safe' (when safe_attrs_only=True).
153
154 ``add_nofollow``:
155 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
156
157 ``host_whitelist``:
158 A list or set of hosts that you can use for embedded content
159 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
160 You can also implement/override the method
161 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
162 implement more complex rules for what can be embedded.
163 Anything that passes this test will be shown, regardless of
164 the value of (for instance) ``embedded``.
165
166 Note that this parameter might not work as intended if you do not
167 make the links absolute before doing the cleaning.
168
169 ``whitelist_tags``:
170 A set of tags that can be included with ``host_whitelist``.
171 The default is ``iframe`` and ``embed``; you may wish to
172 include other tags like ``script``, or you may want to
173 implement ``allow_embedded_url`` for more control. Set to None to
174 include all tags.
175
176 This modifies the document *in place*.
177 """
178
179 scripts = True
180 javascript = True
181 comments = True
182 style = False
183 links = True
184 meta = True
185 page_structure = True
186 processing_instructions = True
187 embedded = True
188 frames = True
189 forms = True
190 annoying_tags = True
191 remove_tags = None
192 allow_tags = None
193 kill_tags = None
194 remove_unknown_tags = True
195 safe_attrs_only = True
196 safe_attrs = defs.safe_attrs
197 add_nofollow = False
198 host_whitelist = ()
199 whitelist_tags = set(['iframe', 'embed'])
200
207
208
209
210 _tag_link_attrs = dict(
211 script='src',
212 link='href',
213
214
215 applet=['code', 'object'],
216 iframe='src',
217 embed='src',
218 layer='src',
219
220
221
222
223
224
225
226
227 a='href',
228 )
229
def __call__(self, doc):
    """
    Cleans the document.

    ``doc`` is either an element or an object with a ``getroot()``
    method, in which case its root element is cleaned.  The tree is
    modified in place according to the option attributes of the
    cleaner; nothing is returned.

    Raises ``ValueError`` when both ``allow_tags`` and
    ``remove_unknown_tags`` are set.  The check runs before the
    document is touched, so a misconfigured cleaner never leaves a
    half-cleaned tree behind.
    """
    # NOTE(review): the ``def`` line of this method is absent from the
    # listing.  The parameters are the names the body uses; the name
    # ``__call__`` is inferred from the module exporting the instance
    # ``clean = Cleaner()`` as a callable -- confirm against the real file.
    if hasattr(doc, 'getroot'):
        # A tree wrapper was passed in; work on its root element.
        doc = doc.getroot()

    # Validate the tag-filter options up front (fail fast).
    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)

    xhtml_to_html(doc)
    # Rename <image> to <img> so the steps below only have to deal with
    # one tag name.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Comments are not dropped wholesale below, so conditional
        # comments get their own pass.
        self.kill_conditional_comments(doc)

    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the names: attributes are deleted while looping.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and
                self.safe_attrs == defs.safe_attrs):
            # With the default safe-attribute filter active the
            # ``on*`` handlers are already gone; otherwise strip them.
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # Styles are being kept, so scrub script constructs out of
            # them.  (When ``style`` is set they are removed entirely
            # further down.)
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new``: starting again from ``old`` would
                # discard the expression() removal done just above.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Still suspicious after the targeted removals:
                    # drop the whole attribute.
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # Same chaining as for the style attributes above.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Still suspicious: blank the stylesheet.
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # Comments are killed when either option is set.
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # <link> elements are kept in general, but linked stylesheets
        # must still go (this matches any ``rel`` value containing
        # "stylesheet", so alternate stylesheets are dropped too).
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # Drop every <param> that has no <applet> or <object> ancestor.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # These tags are removed but their content is kept.
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards, so the tree is not changed
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] is doc:
        # The root element itself cannot be dropped; turn it into a
        # bare <div> instead.
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] is doc:
        # Likewise the root cannot be killed; empty it instead.
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    # Process the kill list back to front (later elements in document
    # order first).
    _kill.reverse()
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # Same root handling as above.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')
399
401 """
402 Override to suppress rel="nofollow" on some anchors.
403 """
404 return False
405
407 if el.tag not in self._tag_link_attrs:
408 return False
409 attr = self._tag_link_attrs[el.tag]
410 if isinstance(attr, (list, tuple)):
411 for one_attr in attr:
412 url = el.get(one_attr)
413 if not url:
414 return False
415 if not self.allow_embedded_url(el, url):
416 return False
417 return True
418 else:
419 url = el.get(attr)
420 if not url:
421 return False
422 return self.allow_embedded_url(el, url)
423
425 if (self.whitelist_tags is not None
426 and el.tag not in self.whitelist_tags):
427 return False
428 scheme, netloc, path, query, fragment = urlsplit(url)
429 netloc = netloc.lower().split(':', 1)[0]
430 if scheme not in ('http', 'https'):
431 return False
432 if netloc in self.host_whitelist:
433 return True
434 return False
435
446
448 bad = []
449 for el in doc.iter(iterate):
450 if condition(el):
451 bad.append(el)
452 for el in bad:
453 el.drop_tree()
454
462
463 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
464
466 """
467 Depending on the browser, stuff like ``e x p r e s s i o n(...)``
468 can get interpreted, or ``expre/* stuff */ssion(...)``. This
469 checks for attempts to do stuff like this.
470
471 Typically the response will be to kill the entire style; if you
472 have just a bit of Javascript in the style another rule will catch
473 that and remove only the Javascript from the style; this catches
474 more sneaky attempts.
475 """
476 style = self._substitute_comments('', style)
477 style = style.replace('\\', '')
478 style = _substitute_whitespace('', style)
479 style = style.lower()
480 if 'javascript:' in style:
481 return True
482 if 'expression(' in style:
483 return True
484 return False
485
494
495 clean = Cleaner()
496 clean_html = clean.clean_html
497
498
499
500
501
502 _link_regexes = [
503 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
504
505 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
506 ]
507
508 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
509
510 _avoid_hosts = [
511 re.compile(r'^localhost', re.I),
512 re.compile(r'\bexample\.(?:com|org|net)$', re.I),
513 re.compile(r'^127\.0\.0\.1$'),
514 ]
515
516 _avoid_classes = ['nolink']
517
522 """
523 Turn any URLs into links.
524
525 It will search for links identified by the given regular
526 expressions (by default mailto and http(s) links).
527
528 It won't link text in an element in avoid_elements, or an element
529 with a class in avoid_classes. It won't link to anything with a
530 host that matches one of the regular expressions in avoid_hosts
531 (default localhost and 127.0.0.1).
532
533 If you pass in an element, the element's tail will not be
534 substituted, only the contents of the element.
535 """
536 if el.tag in avoid_elements:
537 return
538 class_name = el.get('class')
539 if class_name:
540 class_name = class_name.split()
541 for match_class in avoid_classes:
542 if match_class in class_name:
543 return
544 for child in list(el):
545 autolink(child, link_regexes=link_regexes,
546 avoid_elements=avoid_elements,
547 avoid_hosts=avoid_hosts,
548 avoid_classes=avoid_classes)
549 if child.tail:
550 text, tail_children = _link_text(
551 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
552 if tail_children:
553 child.tail = text
554 index = el.index(child)
555 el[index+1:index+1] = tail_children
556 if el.text:
557 text, pre_children = _link_text(
558 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
559 if pre_children:
560 el.text = text
561 el[:0] = pre_children
562
def _link_text(text, link_regexes, avoid_hosts, factory):
    """
    Split ``text`` into leading text plus a list of anchor elements.

    ``link_regexes`` is a sequence of compiled patterns; each must
    define the named groups ``host`` and ``body``.  The whole match
    becomes the ``href`` and ``body`` the visible link text.
    ``avoid_hosts`` is a sequence of compiled patterns; a match whose
    ``host`` group is found by any of them is left as plain text.
    ``factory`` is called as ``factory('a')`` and must return an object
    with ``set()``, ``text`` and ``tail``.

    Returns ``(leading_text, links)``: the text before the first link,
    and the anchors in order, each carrying the text that follows it in
    its ``tail``.  Any run of trailing ``.`` / ``,`` characters is kept
    out of the link and left in the surrounding text.
    """
    leading_text = ''
    links = []
    while True:
        # Pick the earliest acceptable match among all the patterns.
        best_match = None
        for regex in link_regexes:
            search_from = 0
            while True:
                match = regex.search(text, pos=search_from)
                if match is None:
                    break
                host = match.group('host')
                if any(host_regex.search(host) for host_regex in avoid_hosts):
                    # Avoided host: resume the search after this match.
                    search_from = match.end()
                else:
                    break
            if match is None:
                continue
            if best_match is None or match.start() < best_match.start():
                best_match = match
        if best_match is None:
            # No more links: whatever is left follows the last anchor
            # (or is the whole result when there were no links at all).
            if links:
                assert not links[-1].tail
                links[-1].tail = text
            else:
                assert not leading_text
                leading_text = text
            break
        matched = best_match.group(0)
        # Sentence punctuation should not end a link.  Strip the whole
        # trailing run (e.g. an ellipsis), not just one character; fall
        # back to the raw match if nothing would remain, so every pass
        # still consumes text.
        link = matched.rstrip('.,') or matched
        end = best_match.end() - (len(matched) - len(link))
        prev_text = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = prev_text
        else:
            assert not leading_text
            leading_text = prev_text
        anchor = factory('a')
        anchor.set('href', link)
        body = best_match.group('body') or link
        anchor.text = body.rstrip('.,') or body
        links.append(anchor)
        text = text[end:]
    return leading_text, links
620
629
630 autolink_html.__doc__ = autolink.__doc__
631
632
633
634
635
636 _avoid_word_break_elements = ['pre', 'textarea', 'code']
637 _avoid_word_break_classes = ['nobreak']
638
643 """
644 Breaks any long words found in the body of the text (not attributes).
645
646 Doesn't affect any of the tags in avoid_elements, by default
647 ``<pre>``, ``<textarea>`` and ``<code>``.
648
649 Breaks words by inserting &#8203;, which is the Unicode
650 Zero Width Space character. This generally takes up no space
651 in rendering, but does copy as a space, and in monospace contexts
652 usually takes up space.
653
654 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
655 """
656
657
658 if el.tag in _avoid_word_break_elements:
659 return
660 class_name = el.get('class')
661 if class_name:
662 dont_break = False
663 class_name = class_name.split()
664 for avoid in avoid_classes:
665 if avoid in class_name:
666 dont_break = True
667 break
668 if dont_break:
669 return
670 if el.text:
671 el.text = _break_text(el.text, max_width, break_character)
672 for child in el:
673 word_break(child, max_width=max_width,
674 avoid_elements=avoid_elements,
675 avoid_classes=avoid_classes,
676 break_character=break_character)
677 if child.tail:
678 child.tail = _break_text(child.tail, max_width, break_character)
679
685
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with every over-long word broken up.

    Words are the whitespace-separated tokens of ``text``.  Each token
    longer than ``max_width`` is passed through ``_insert_break``; all
    whitespace and all shorter tokens are copied through unchanged.

    Every token is rewritten at its own position.  A global
    ``str.replace`` would also rewrite the word wherever it appears
    inside a longer token, leaving that token with segments wider than
    ``max_width``.
    """
    pieces = []
    pos = 0
    for word in text.split():
        # Only whitespace lies between ``pos`` and the next token, so
        # the first occurrence found from ``pos`` is the token itself.
        start = text.index(word, pos)
        pieces.append(text[pos:start])
        if len(word) > max_width:
            word = _insert_break(word, max_width, break_character)
            # The token's original length is needed to advance ``pos``.
            pos = text.index(' ', start) if False else pos
        pieces.append(word)
        pos = start + len(text[start:].split(None, 1)[0])
    pieces.append(text[pos:])
    return ''.join(pieces)


# A character that is not an ASCII letter: a preferred place to break.
_break_prefer_re = re.compile(r'[^a-z]', re.I)


def _insert_break(word, width, break_character):
    """
    Return ``word`` with ``break_character`` inserted so that no
    unbroken segment is longer than ``width`` characters.

    Within each ``width``-sized chunk the break is moved back to just
    after the last non-letter character, provided that position is
    fewer than ten characters from the end of the chunk.

    Raises ``ValueError`` if ``width`` is smaller than 1, since no
    progress could be made and the loop would never terminate.
    """
    if width < 1:
        raise ValueError("width must be at least 1, got %r" % (width,))
    pieces = []
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back to the last break if it is not too far back.
            if last_break.end() > width-10:
                start = word[:last_break.end()]
        pieces.append(start)
        pieces.append(break_character)
        word = word[len(start):]
    pieces.append(word)
    return ''.join(pieces)
713