Package lxml :: Package html :: Module clean
[hide private]
[frames | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  from __future__ import absolute_import 
  8   
  9  import re 
 10  import copy 
 11  try: 
 12      from urlparse import urlsplit 
 13      from urllib import unquote_plus 
 14  except ImportError: 
 15      # Python 3 
 16      from urllib.parse import urlsplit, unquote_plus 
 17  from lxml import etree 
 18  from lxml.html import defs 
 19  from lxml.html import fromstring, XHTML_NAMESPACE 
 20  from lxml.html import xhtml_to_html, _transform_result 
 21   
 22  try: 
 23      unichr 
 24  except NameError: 
 25      # Python 3 
 26      unichr = chr 
 27  try: 
 28      unicode 
 29  except NameError: 
 30      # Python 3 
 31      unicode = str 
 32  try: 
 33      bytes 
 34  except NameError: 
 35      # Python < 2.6 
 36      bytes = str 
 37  try: 
 38      basestring 
 39  except NameError: 
 40      basestring = (str, bytes) 
 41   
 42   
 43  __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 
 44             'word_break', 'word_break_html'] 
 45   
 46  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 47  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 48  # I have multiple kinds of schemes searched; but should schemes be 
 49  #   whitelisted instead? 
 50  # max height? 
 51  # remove images?  Also in CSS?  background attribute? 
 52  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 53  #   allow *just* embedded YouTube movies) 
 54  # Log what was deleted and why? 
 55  # style="behavior: ..." might be bad in IE? 
 56  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 57  #   metas. 
 58  # UTF-7 detections?  Example: 
 59  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 60  #   you don't always have to have the charset set, if the page has no charset 
 61  #   and there's UTF7-like code in it. 
 62  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 63   
 64   
 65  # This is an IE-specific construct you can have in a stylesheet to 
 66  # run some Javascript: 
 67  _css_javascript_re = re.compile( 
 68      r'expression\s*\(.*?\)', re.S|re.I) 
 69   
 70  # Do I have to worry about @\nimport? 
 71  _css_import_re = re.compile( 
 72      r'@\s*import', re.I) 
 73   
 74  # All kinds of schemes besides just javascript: that can cause 
 75  # execution: 
 76  _is_image_dataurl = re.compile( 
 77      r'^data:image/.+;base64', re.I).search 
 78  _is_possibly_malicious_scheme = re.compile( 
 79      r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):', 
 80      re.I).search 
81 -def _is_javascript_scheme(s):
82 if _is_image_dataurl(s): 83 return None 84 return _is_possibly_malicious_scheme(s)
# Substitution helper for runs of whitespace and low control characters.
# NOTE(review): the character class stops at \x19, so \x1A-\x1F are left
# untouched -- confirm that this upper bound is intended.
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
# FIXME: should data: be blocked?

# Recognises the "[if ...]>" opening of an IE conditional comment.
# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.IGNORECASE | re.DOTALL)

# Every element (including the context node) that has a style attribute.
_find_styled_elements = etree.XPath("descendant-or-self::*[@style]")

# Anchors, plain or XHTML-namespaced, whose href is non-empty and does not
# start with '#'.
_find_external_links = etree.XPath(
    ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x': XHTML_NAMESPACE})
class Cleaner(object):
    """
    Instances clean a document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
        as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags.

    ``inline_style``:
        Removes any style attributes.  Defaults to the value of the ``style`` option.

    ``links``:
        Removes any ``<link>`` tags.

    ``meta``:
        Removes any ``<meta>`` tags.

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes).

    ``frames``:
        Removes any frame-related tags.

    ``forms``:
        Removes any form tags.

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``.

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    # Class-level defaults for the options described in the docstring.
    scripts = True
    javascript = True
    comments = True
    style = False
    # None means "not set"; the constructor then copies the ``style`` value.
    inline_style = None
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])
def __init__(self, **kw):
    """
    Override cleaning options with the given keyword arguments.

    Every keyword must name an attribute that already exists on the
    instance; anything else raises ``TypeError``.  When ``inline_style``
    is neither passed nor already set, it takes the value of ``style``.
    """
    for option in kw:
        if not hasattr(self, option):
            raise TypeError(
                "Unknown parameter: %s=%r" % (option, kw[option]))
        setattr(self, option, kw[option])
    if 'inline_style' not in kw and self.inline_style is None:
        self.inline_style = self.style
# Maps a tag that is up for removal to the attribute(s) holding its
# primary URL:
_tag_link_attrs = {
    'script': 'src',
    'link': 'href',
    # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
    # From what I can tell, both attributes can contain a link:
    'applet': ['code', 'object'],
    'iframe': 'src',
    'embed': 'src',
    'layer': 'src',
    # FIXME: there doesn't really seem like a general way to figure out what
    # links an <object> tag uses; links often go in <param> tags with values
    # that we don't really know.  You'd have to have knowledge about specific
    # kinds of plugins (probably keyed off classid), and match against those.
    ## 'object': ?,
    # FIXME: not looking at the action currently, because it is more complex
    # than that -- if you keep the form, you should keep the form controls.
    ## 'form': 'action',
    'a': 'href',
}
def __call__(self, doc):
    """
    Cleans the document in place, according to the option attributes.

    ``doc`` may be an element or an object with a ``getroot()`` method
    (in which case its root element is cleaned).

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set; this is checked before the
    document is touched.
    """
    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())
    allow_tags = set(self.allow_tags or ())

    # Reject contradictory options up front, so that a bad configuration
    # never leaves a half-cleaned document behind.
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)

    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter(etree.Element):
            attrib = el.attrib
            for aname in attrib.keys():
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and
                self.safe_attrs == defs.safe_attrs):
            # safe_attrs handles events attributes itself
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        # If we're deleting style then we don't have to remove JS links
        # from styles, otherwise...
        if not self.inline_style:
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
        if not self.style:
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow.
                # Chain on ``new`` (not ``old``) so the expression()
                # removal above is not thrown away:
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
    if self.inline_style:
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                if not self.allow_element(el):
                    el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>
        # or <object>; these are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards, so the tree is not changed while
    # it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if allow_tags:
        # Comments and processing instructions have a non-string tag that
        # is never in a tag whitelist; keep them when the options say so.
        if not self.comments:
            allow_tags.add(etree.Comment)
        if not self.processing_instructions:
            allow_tags.add(etree.ProcessingInstruction)
        bad = [el for el in doc.iter() if el.tag not in allow_tags]
        if bad:
            if bad[0] is doc:
                # The root cannot be dropped; neutralise it instead.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                rel = el.get('rel')
                if rel:
                    # Cheap substring test first, then an exact
                    # whitespace-delimited token test.
                    if ('nofollow' in rel
                            and ' nofollow ' in (' %s ' % rel)):
                        continue
                    rel = '%s nofollow' % rel
                else:
                    rel = 'nofollow'
                el.set('rel', rel)
430
def allow_follow(self, anchor):
    """
    Hook deciding whether ``anchor`` is exempt from ``rel="nofollow"``.

    This default always answers False, i.e. no anchor is exempt;
    override it to suppress rel="nofollow" on some anchors.
    """
    return False
436
def allow_element(self, el):
    """
    Decide whether ``el`` may stay even though its tag is up for removal.

    The tag must be listed in ``_tag_link_attrs``.  Each link attribute
    listed for the tag must be present, non-empty and accepted by
    ``allow_embedded_url``; otherwise the answer is False.
    """
    try:
        attr_names = self._tag_link_attrs[el.tag]
    except KeyError:
        return False
    if not isinstance(attr_names, (list, tuple)):
        # Single attribute name.
        url = el.get(attr_names)
        return self.allow_embedded_url(el, url) if url else False
    for attr_name in attr_names:
        url = el.get(attr_name)
        if not (url and self.allow_embedded_url(el, url)):
            return False
    return True
454
def allow_embedded_url(self, el, url):
    """
    Decide whether ``url`` may be embedded through element ``el``.

    Returns True only if the element's tag is in ``whitelist_tags``
    (or ``whitelist_tags`` is None), the URL scheme is http or https,
    and the URL's host is in ``host_whitelist``.  URLs that cannot be
    parsed are rejected.
    """
    if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
        return False
    try:
        parts = urlsplit(url)
        # ``hostname`` strips userinfo and port and is lower-cased, so
        # "http://good.example:pw@evil.example/" is judged by its real
        # host rather than by the text before the first colon.
        host = parts.hostname
    except ValueError:
        # Malformed netloc (e.g. a broken IPv6 literal): not trustworthy.
        return False
    if parts.scheme not in ('http', 'https'):
        return False
    return host is not None and host in self.host_whitelist
466
def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.
    """
    # Only comment nodes are visited; those whose text matches the
    # conditional-comment pattern are dropped by _kill_elements.
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)
477
def _kill_elements(self, doc, condition, iterate=None):
    """
    Drop the subtree of every element for which ``condition(el)`` is true.

    ``iterate`` is passed through to ``doc.iter()`` to restrict which
    nodes are visited.  All matches are collected before anything is
    dropped, so the tree is not modified during the traversal.
    """
    doomed = [el for el in doc.iter(iterate) if condition(el)]
    for el in doomed:
        el.drop_tree()
# NOTE(review): the source listing collapses a definition just before this
# point (listing lines 486-492 are not shown); it is left untouched here.

# Substitution helper for CSS comments (``/* ... */``, possibly multi-line).
_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub


def _has_sneaky_javascript(self, style):
    """
    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
    checks for attempts to do stuff like this, and likewise for
    ``@import`` rules hidden the same way (``@im\\port``,
    ``@/**/import``).

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.

    Returns True if the normalised style still looks dangerous.
    """
    # Normalise: drop comments, backslashes and whitespace, then lower-case,
    # so that obfuscated spellings collapse to their plain form.
    style = self._substitute_comments('', style)
    style = style.replace('\\', '')
    style = _substitute_whitespace('', style)
    style = style.lower()
    if 'javascript:' in style:
        return True
    if 'expression(' in style:
        return True
    if '@import' in style:
        # An import that only becomes visible after normalisation.
        return True
    return False
516
def clean_html(self, html):
    """
    Clean ``html`` and return the result in the same form it was given.

    A string is parsed with ``fromstring``; anything else is deep-copied
    first, so the caller's object is not modified.
    """
    result_type = type(html)
    doc = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    self(doc)
    return _transform_result(result_type, doc)
# Module-level default cleaner and its bound ``clean_html`` shortcut.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Patterns for link candidates; each captures named groups ``body`` and
# ``host``.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): the source listing collapses several definitions at this
# point (listing lines 549-592, 594-650 and 652-659 are not shown) --
# presumably the autolink functions named below; they are left untouched.

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in ``avoid_elements`` (by default
    ``<pre>``, ``<textarea>`` and ``<code>``), nor elements carrying one
    of the classes in ``avoid_classes`` (by default ``nobreak``); their
    whole subtree is left alone.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    The element is modified in place; nothing is returned.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's ``avoid_elements`` rather than the module default.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_names = class_name.split()
        if any(avoid in class_names for avoid in avoid_classes):
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail belongs to this element's text flow, so it is broken
        # even if the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
710
def word_break_html(html, *args, **kw):
    """
    Parse ``html``, apply ``word_break`` to it and return the result
    converted back to the type of the input.

    Extra positional and keyword arguments are handed to ``word_break``.
    """
    doc = fromstring(html)
    word_break(doc, *args, **kw)
    return _transform_result(type(html), doc)
716
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with ``break_character`` inserted into every
    whitespace-delimited word longer than ``max_width``.

    Each word is rewritten where it stands, so a long word that also
    occurs inside another word does not disturb that other word, and all
    whitespace is preserved as it was.
    """
    def _break_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word

    return re.compile(r'\S+', re.U).sub(_break_word, text)


# Any non-letter is a preferred place to break a word.
_break_prefer_re = re.compile(r'[^a-z]', re.I)


def _insert_break(word, width, break_character):
    """
    Split ``word`` into chunks of at most ``width`` characters joined by
    ``break_character``.

    A chunk is cut just after its last non-letter character when that
    position lies within the final 10 characters of the chunk.  A
    ``width`` below 1 cannot be satisfied; the word is returned unchanged.
    """
    if width < 1:
        # No chunk could ever make progress; avoid looping forever.
        return word
    pieces = []
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nice break:
            if last_break.end() > width - 10:
                # FIXME: should the break character be at the end of the
                # chunk, or the beginning of the next chunk?
                start = word[:last_break.end()]
        pieces.append(start)
        pieces.append(break_character)
        word = word[len(start):]
    pieces.append(word)
    return ''.join(pieces)
744