1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 except ImportError:
12
13 from urllib.parse import urlsplit
14 from lxml import etree
15 from lxml.html import defs
16 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
17 from lxml.html import xhtml_to_html, _transform_result
18
19 try:
20 unichr
21 except NameError:
22
23 unichr = chr
24 try:
25 unicode
26 except NameError:
27
28 unicode = str
29 try:
30 bytes
31 except NameError:
32
33 bytes = str
34 try:
35 basestring
36 except NameError:
37 basestring = (str, bytes)
38
39
40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
41 'word_break', 'word_break_html']
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# CSS ``expression(...)`` call: the keyword, optional whitespace, then the
# shortest span up to the first closing parenthesis.  Case-insensitive, and
# ``.`` also crosses line breaks.
_css_javascript_re = re.compile(r'expression\s*\(.*?\)', re.IGNORECASE | re.DOTALL)

# CSS ``@import`` rule; whitespace between ``@`` and ``import`` is tolerated.
_css_import_re = re.compile(r'@\s*import', re.IGNORECASE)

# Optional leading whitespace followed by one of the listed URL scheme
# prefixes (``javascript:``, ``vbscript:``, ``data:`` ...), any letter case.
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.IGNORECASE)

# Bound ``sub`` of a whitespace-run pattern: call as
# ``_substitute_whitespace(replacement, text)``.
_substitute_whitespace = re.compile(r'\s+').sub

# Opening of a conditional comment: ``[if``, whitespace, the shortest span up
# to a ``]``, optional whitespace, then ``>``.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>',
    re.IGNORECASE | re.DOTALL)
81
# Compiled XPath: the context element and every descendant that carries a
# ``style`` attribute.
_find_styled_elements = etree.XPath("descendant-or-self::*[@style]")

# Compiled XPath: ``<a>`` elements (plain or in the XHTML namespace, bound to
# the ``x`` prefix) whose ``href`` is non-blank and does not begin with ``#``.
_find_external_links = etree.XPath(
    "descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
    "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']",
    namespaces={'x': XHTML_NAMESPACE})
89
91 """
92 Instances cleans the document of each of the possible offending
93 elements. The cleaning is controlled by attributes; you can
94 override attributes in a subclass, or set them in the constructor.
95
96 ``scripts``:
97 Removes any ``<script>`` tags.
98
99 ``javascript``:
100 Removes any Javascript, like an ``onclick`` attribute.
101
102 ``comments``:
103 Removes any comments.
104
105 ``style``:
106 Removes any style tags or attributes.
107
108 ``links``:
109 Removes any ``<link>`` tags
110
111 ``meta``:
112 Removes any ``<meta>`` tags
113
114 ``page_structure``:
115 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
116
117 ``processing_instructions``:
118 Removes any processing instructions.
119
120 ``embedded``:
121 Removes any embedded objects (flash, iframes)
122
123 ``frames``:
124 Removes any frame-related tags
125
126 ``forms``:
127 Removes any form tags
128
129 ``annoying_tags``:
130 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
131
132 ``remove_tags``:
133 A list of tags to remove. Only the tags will be removed,
134 their content will get pulled up into the parent tag.
135
136 ``kill_tags``:
137 A list of tags to kill. Killing also removes the tag's content,
138 i.e. the whole subtree, not just the tag itself.
139
140 ``allow_tags``:
141 A list of tags to include (default include all).
142
143 ``remove_unknown_tags``:
144 Remove any tags that aren't standard parts of HTML.
145
146 ``safe_attrs_only``:
147 If true, only include 'safe' attributes (specifically the list
148 from the feedparser HTML sanitisation web site).
149
150 ``add_nofollow``:
151 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
152
153 ``host_whitelist``:
154 A list or set of hosts that you can use for embedded content
155 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
156 You can also implement/override the method
157 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
158 implement more complex rules for what can be embedded.
159 Anything that passes this test will be shown, regardless of
160 the value of (for instance) ``embedded``.
161
162 Note that this parameter might not work as intended if you do not
163 make the links absolute before doing the cleaning.
164
165 ``whitelist_tags``:
166 A set of tags that can be included with ``host_whitelist``.
167 The default is ``iframe`` and ``embed``; you may wish to
168 include other tags like ``script``, or you may want to
169 implement ``allow_embedded_url`` for more control. Set to None to
170 include all tags.
171
172 This modifies the document *in place*.
173 """
174
175 scripts = True
176 javascript = True
177 comments = True
178 style = False
179 links = True
180 meta = True
181 page_structure = True
182 processing_instructions = True
183 embedded = True
184 frames = True
185 forms = True
186 annoying_tags = True
187 remove_tags = None
188 allow_tags = None
189 kill_tags = None
190 remove_unknown_tags = True
191 safe_attrs_only = True
192 add_nofollow = False
193 host_whitelist = ()
194 whitelist_tags = set(['iframe', 'embed'])
195
202
203
204
205 _tag_link_attrs = dict(
206 script='src',
207 link='href',
208
209
210 applet=['code', 'object'],
211 iframe='src',
212 embed='src',
213 layer='src',
214
215
216
217
218
219
220
221
222 a='href',
223 )
224
226 """
227 Cleans the document.
228 """
# Cleans ``doc`` in place according to the option attributes on ``self``.
#
# Steps, in order: normalise the tree, strip unsafe attributes, scrub
# script-bearing links and CSS, collect tags to "kill" (drop with their whole
# subtree) or "remove" (drop the tag but keep its content), apply those
# drops, filter by the allowed-tag set, and finally mark external anchors
# with rel="nofollow".
#
# Raises ValueError when both ``remove_unknown_tags`` and ``allow_tags`` are
# set.  NOTE(review): that check runs only after the tree has already been
# modified.
if hasattr(doc, 'getroot'):
    # A tree wrapper was passed rather than an element; work on its root.
    doc = doc.getroot()

xhtml_to_html(doc)

# Rename <image> to <img> so the tag-based rules below only have to deal
# with one spelling.
for el in doc.iter('image'):
    el.tag = 'img'
if not self.comments:
    # Comments are being kept, so conditional comments are handled
    # separately by the dedicated hook.
    self.kill_conditional_comments(doc)

kill_tags = set(self.kill_tags or ())
remove_tags = set(self.remove_tags or ())

if self.scripts:
    kill_tags.add('script')
if self.safe_attrs_only:
    safe_attrs = set(defs.safe_attrs)
    for el in doc.iter():
        attrib = el.attrib
        # Iterate over a snapshot of the names: entries are deleted below.
        for aname in list(attrib.keys()):
            if aname not in safe_attrs:
                del attrib[aname]
if self.javascript:
    if not self.safe_attrs_only:
        # The safe-attribute pass above was skipped, so event handler
        # attributes (``on*``) still have to be stripped here.
        for el in doc.iter():
            attrib = el.attrib
            for aname in list(attrib.keys()):
                if aname.startswith('on'):
                    del attrib[aname]
    doc.rewrite_links(self._remove_javascript_link,
                      resolve_base_href=False)
    if not self.style:
        # Styles are being kept (they are removed wholesale further down
        # when ``self.style`` is set), so scrub script constructs out of
        # them instead.
        for el in _find_styled_elements(doc):
            old = el.get('style')
            new = _css_javascript_re.sub('', old)
            # Chain from ``new``: restarting from ``old`` would throw away
            # the expression() removal done on the previous line.
            new = _css_import_re.sub('', new)
            if self._has_sneaky_javascript(new):
                # Obfuscated script survived the substitutions; drop the
                # whole attribute.
                del el.attrib['style']
            elif new != old:
                el.set('style', new)
        # list(): elements may be dropped from the tree while looping.
        for el in list(doc.iter('style')):
            if el.get('type', '').lower().strip() == 'text/javascript':
                el.drop_tree()
                continue
            old = el.text or ''
            new = _css_javascript_re.sub('', old)
            # Same chaining as above, so both removals are kept.
            new = _css_import_re.sub('', new)
            if self._has_sneaky_javascript(new):
                # Obfuscated script survived; blank the stylesheet.
                el.text = '/* deleted */'
            elif new != old:
                el.text = new
if self.comments or self.processing_instructions:
    # NOTE(review): comments are also killed when only
    # ``processing_instructions`` is set; kept as in the original.
    kill_tags.add(etree.Comment)
if self.processing_instructions:
    kill_tags.add(etree.ProcessingInstruction)
if self.style:
    kill_tags.add('style')
    etree.strip_attributes(doc, 'style')
if self.links:
    kill_tags.add('link')
elif self.style or self.javascript:
    # <link> tags are kept in general, but any whose ``rel`` mentions
    # "stylesheet" is dropped when styles or Javascript are being removed.
    for el in list(doc.iter('link')):
        if 'stylesheet' in el.get('rel', '').lower():
            el.drop_tree()
if self.meta:
    kill_tags.add('meta')
if self.page_structure:
    remove_tags.update(('head', 'html', 'title'))
if self.embedded:
    # Drop every <param> that has no <applet> or <object> ancestor.
    for el in list(doc.iter('param')):
        parent = el.getparent()
        while parent is not None and parent.tag not in ('applet', 'object'):
            parent = parent.getparent()
        if parent is None:
            el.drop_tree()
    kill_tags.update(('applet',))
    # These are only unwrapped, so their fallback content stays in place.
    remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
if self.frames:
    kill_tags.update(defs.frame_tags)
if self.forms:
    remove_tags.add('form')
    kill_tags.update(('button', 'input', 'select', 'textarea'))
if self.annoying_tags:
    remove_tags.update(('blink', 'marquee'))

# Collect first, drop afterwards, so the tree is not mutated mid-iteration.
_remove = []
_kill = []
for el in doc.iter():
    if el.tag in kill_tags:
        if self.allow_element(el):
            continue
        _kill.append(el)
    elif el.tag in remove_tags:
        if self.allow_element(el):
            continue
        _remove.append(el)

if _remove and _remove[0] is doc:
    # The root element itself cannot be dropped; turn it into a bare <div>.
    el = _remove.pop(0)
    el.tag = 'div'
    el.attrib.clear()
elif _kill and _kill[0] is doc:
    # Likewise the root cannot be killed; empty it out instead.
    el = _kill.pop(0)
    if el.tag != 'html':
        el.tag = 'div'
    el.clear()

# Reverse document order, so nested elements are dropped before the
# elements that contain them.
_kill.reverse()
for el in _kill:
    el.drop_tree()
for el in _remove:
    el.drop_tag()

allow_tags = self.allow_tags
if self.remove_unknown_tags:
    if allow_tags:
        raise ValueError(
            "It does not make sense to pass in both allow_tags and remove_unknown_tags")
    allow_tags = set(defs.tags)
if allow_tags:
    bad = [el for el in doc.iter() if el.tag not in allow_tags]
    if bad:
        if bad[0] is doc:
            # Same root special case as above.
            el = bad.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        for el in bad:
            el.drop_tag()
if self.add_nofollow:
    for el in _find_external_links(doc):
        if not self.allow_follow(el):
            el.set('rel', 'nofollow')
393
395 """
396 Override to suppress rel="nofollow" on some anchors.
397 """
398 return False
399
401 if el.tag not in self._tag_link_attrs:
402 return False
403 attr = self._tag_link_attrs[el.tag]
404 if isinstance(attr, (list, tuple)):
405 for one_attr in attr:
406 url = el.get(one_attr)
407 if not url:
408 return False
409 if not self.allow_embedded_url(el, url):
410 return False
411 return True
412 else:
413 url = el.get(attr)
414 if not url:
415 return False
416 return self.allow_embedded_url(el, url)
417
419 if (self.whitelist_tags is not None
420 and el.tag not in self.whitelist_tags):
421 return False
422 scheme, netloc, path, query, fragment = urlsplit(url)
423 netloc = netloc.lower().split(':', 1)[0]
424 if scheme not in ('http', 'https'):
425 return False
426 if netloc in self.host_whitelist:
427 return True
428 return False
429
440
442 bad = []
443 for el in doc.iter(iterate):
444 if condition(el):
445 bad.append(el)
446 for el in bad:
447 el.drop_tree()
448
456
457 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
458
460 """
461 Depending on the browser, stuff like ``e x p r e s s i o n(...)``
462 can get interpreted, or ``expre/* stuff */ssion(...)``. This
463 checks for attempt to do stuff like this.
464
465 Typically the response will be to kill the entire style; if you
466 have just a bit of Javascript in the style another rule will catch
467 that and remove only the Javascript from the style; this catches
468 more sneaky attempts.
469 """
470 style = self._substitute_comments('', style)
471 style = style.replace('\\', '')
472 style = _substitute_whitespace('', style)
473 style = style.lower()
474 if 'javascript:' in style:
475 return True
476 if 'expression(' in style:
477 return True
478 return False
479
488
489 clean = Cleaner()
490 clean_html = clean.clean_html
491
492
493
494
495
# Patterns recognised as linkable text.  Each one exposes a ``body`` group
# (the text shown for the link) and a ``host`` group (checked against the
# avoid-host patterns).
_link_regexes = [
    # http:// and https:// URLs, with an optional trailing "(...)" part.
    re.compile(
        r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)',
        re.IGNORECASE),
    # mailto: addresses; the ``body`` group leaves out the "mailto:" prefix.
    re.compile(
        r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))',
        re.IGNORECASE),
]

# Tag names whose text is never turned into links.
_avoid_elements = [
    'textarea',
    'pre',
    'code',
    'head',
    'select',
    'a',
]

# Host patterns that must not be linked: anything starting with "localhost",
# the example.com/.org/.net domains, and the literal 127.0.0.1.
_avoid_hosts = [
    re.compile(r'^localhost', re.IGNORECASE),
    re.compile(r'\bexample\.(?:com|org|net)$', re.IGNORECASE),
    re.compile(r'^127\.0\.0\.1$'),
]

# CSS class names that switch linking off for an element.
_avoid_classes = [
    'nolink',
]
511
516 """
517 Turn any URLs into links.
518
519 It will search for links identified by the given regular
520 expressions (by default mailto and http(s) links).
521
522 It won't link text in an element in avoid_elements, or an element
523 with a class in avoid_classes. It won't link to anything with a
524 host that matches one of the regular expressions in avoid_hosts
525 (default localhost and 127.0.0.1).
526
527 If you pass in an element, the element's tail will not be
528 substituted, only the contents of the element.
529 """
530 if el.tag in avoid_elements:
531 return
532 class_name = el.get('class')
533 if class_name:
534 class_name = class_name.split()
535 for match_class in avoid_classes:
536 if match_class in class_name:
537 return
538 for child in list(el):
539 autolink(child, link_regexes=link_regexes,
540 avoid_elements=avoid_elements,
541 avoid_hosts=avoid_hosts,
542 avoid_classes=avoid_classes)
543 if child.tail:
544 text, tail_children = _link_text(
545 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
546 if tail_children:
547 child.tail = text
548 index = el.index(child)
549 el[index+1:index+1] = tail_children
550 if el.text:
551 text, pre_children = _link_text(
552 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
553 if pre_children:
554 el.text = text
555 el[:0] = pre_children
556
def _link_text(text, link_regexes, avoid_hosts, factory):
    """
    Split ``text`` into leading plain text plus a list of anchor elements.

    ``link_regexes`` are compiled patterns exposing ``host`` and ``body``
    groups; a match whose host is hit by any pattern in ``avoid_hosts`` is
    skipped.  ``factory`` is called as ``factory('a')`` to build each anchor.

    Returns ``(leading_text, anchors)``: the text before the first link,
    and the anchors in order.  Each anchor has ``href`` set to the matched
    link, ``text`` set to its ``body`` group, and ``tail`` set to the plain
    text that follows it.  A single trailing ``.`` or ``,`` is left out of
    the link and stays in the surrounding text.
    """
    def first_allowed_match(pattern, remaining):
        # Earliest match of ``pattern`` whose host is not on the avoid list;
        # an avoided match is stepped over by resuming the search at its end.
        offset = 0
        while True:
            found = pattern.search(remaining, pos=offset)
            if found is None:
                return None
            hostname = found.group('host')
            if any(blocked.search(hostname) for blocked in avoid_hosts):
                offset = found.end()
                continue
            return found

    leading_text = ''
    anchors = []
    while True:
        # Choose the match that starts earliest; on a tie the pattern
        # listed first wins.
        winner = None
        for pattern in link_regexes:
            candidate = first_allowed_match(pattern, text)
            if candidate is None:
                continue
            if winner is None or candidate.start() < winner.start():
                winner = candidate

        # Plain text up to the next link, or all remaining text if none.
        if winner is None:
            plain = text
        else:
            plain = text[:winner.start()]
        if anchors:
            assert not anchors[-1].tail
            anchors[-1].tail = plain
        else:
            assert not leading_text
            leading_text = plain
        if winner is None:
            break

        href = winner.group(0)
        cut = winner.end()
        if href.endswith(('.', ',')):
            # Sentence punctuation should not end a link; give it back to
            # the text that follows.
            cut -= 1
            href = href[:-1]

        anchor = factory('a')
        anchor.set('href', href)
        label = winner.group('body') or href
        if label.endswith(('.', ',')):
            label = label[:-1]
        anchor.text = label
        anchors.append(anchor)
        text = text[cut:]
    return leading_text, anchors
614
623
624 autolink_html.__doc__ = autolink.__doc__
625
626
627
628
629
# Tags whose text is left untouched by word breaking.
_avoid_word_break_elements = [
    'pre',
    'textarea',
    'code',
]
# Class names that opt an element out of word breaking -- presumably the
# default for the ``avoid_classes`` argument; confirm against the signature.
_avoid_word_break_classes = [
    'nobreak',
]
632
637 """
638 Breaks any long words found in the body of the text (not attributes).
639
640 Doesn't effect any of the tags in avoid_elements, by default
641 ``<textarea>`` and ``<pre>``
642
643 Breaks words by inserting ​, which is a unicode character
644 for Zero Width Space character. This generally takes up no space
645 in rendering, but does copy as a space, and in monospace contexts
646 usually takes up space.
647
648 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
649 """
650
651
652 if el.tag in _avoid_word_break_elements:
653 return
654 class_name = el.get('class')
655 if class_name:
656 dont_break = False
657 class_name = class_name.split()
658 for avoid in avoid_classes:
659 if avoid in class_name:
660 dont_break = True
661 break
662 if dont_break:
663 return
664 if el.text:
665 el.text = _break_text(el.text, max_width, break_character)
666 for child in el:
667 word_break(child, max_width=max_width,
668 avoid_elements=avoid_elements,
669 avoid_classes=avoid_classes,
670 break_character=break_character)
671 if child.tail:
672 child.tail = _break_text(child.tail, max_width, break_character)
673
679
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with every over-long word broken up.

    Words are the whitespace-separated pieces of the original ``text``.
    Each one longer than ``max_width`` is passed through ``_insert_break``,
    and every occurrence of that word in the running text is replaced by
    the result.
    """
    # ``text.split()`` is evaluated once, on the unmodified input, before
    # any replacement is made.
    for token in text.split():
        if len(token) <= max_width:
            continue
        broken = _insert_break(token, max_width, break_character)
        text = text.replace(token, broken)
    return text
687
# Any single character other than a-z (matched case-insensitively).
_break_prefer_re = re.compile(r'[^a-z]', re.IGNORECASE)
689
691 orig_word = word
692 result = ''
693 while len(word) > width:
694 start = word[:width]
695 breaks = list(_break_prefer_re.finditer(start))
696 if breaks:
697 last_break = breaks[-1]
698
699 if last_break.end() > width-10:
700
701
702 start = word[:last_break.end()]
703 result += start + break_character
704 word = word[len(start):]
705 result += word
706 return result
707