Package lxml :: Package html :: Module clean
[hide private]
[frames] | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11  except ImportError: 
 12      # Python 3 
 13      from urllib.parse import urlsplit 
 14  from lxml import etree 
 15  from lxml.html import defs 
 16  from lxml.html import fromstring, tostring, XHTML_NAMESPACE 
 17  from lxml.html import xhtml_to_html, _transform_result 
 18   
# Python 2/3 compatibility shims: each ``try`` probes for a builtin name and
# the ``except NameError`` branch binds a substitute when it does not exist.
try:
    unichr
except NameError:
    # Python 3
    unichr = chr
try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    bytes
except NameError:
    # Python < 2.6
    bytes = str
try:
    basestring
except NameError:
    # No ``basestring`` builtin: use a tuple so that
    # ``isinstance(x, basestring)`` accepts both text and byte strings.
    basestring = (str, bytes)


# Names exported by ``from lxml.html.clean import *``.
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']
 42   
 43  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 44  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 45  # I have multiple kinds of schemes searched; but should schemes be 
 46  #   whitelisted instead? 
 47  # max height? 
 48  # remove images?  Also in CSS?  background attribute? 
 49  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 50  #   allow *just* embedded YouTube movies) 
 51  # Log what was deleted and why? 
 52  # style="behavior: ..." might be bad in IE? 
 53  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 54  #   metas. 
 55  # UTF-7 detections?  Example: 
 56  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 57  #   you don't always have to have the charset set, if the page has no charset 
 58  #   and there's UTF7-like code in it. 
 59  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 60   
 61   
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript.  The pattern matches ``expression`` followed by
# optional whitespace and a (non-greedy) parenthesised body; re.S lets the
# body span newlines and re.I ignores case.
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Matches ``@import`` (optionally with whitespace after the ``@``),
# case-insensitively.
# Do I have to worry about @\nimport?
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution:
# ``_is_image_dataurl`` is the bound ``search`` of a pattern anchored at the
# start of the string: ``data:image/<something>;base64``.
_is_image_dataurl = re.compile(
    r'^data:image/.+;base64', re.I).search
# ``_is_possibly_malicious_scheme`` is an unanchored ``search``: it finds any
# of the listed scheme names followed by ``:`` anywhere in the string.
_is_possibly_malicious_scheme = re.compile(
    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
    re.I).search
def _is_javascript_scheme(s):
    """Tell whether the string ``s`` uses a scheme that may execute code.

    Returns ``None`` when ``s`` is recognised as a base64 image ``data:``
    URL; otherwise returns the result of searching ``s`` for one of the
    possibly malicious schemes (a match object, or ``None`` if none is
    found).
    """
    return None if _is_image_dataurl(s) else _is_possibly_malicious_scheme(s)
# Bound ``sub`` of a pattern matching runs of whitespace and of the control
# characters \x00-\x08, \x0B, \x0C and \x0E-\x19.  Called as
# ``_substitute_whitespace('', text)`` it deletes all such runs.
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
# FIXME: should data: be blocked?

# Matches the ``[if ...]>`` opener of an IE conditional comment;
# case-insensitive, and ``.`` also matches newlines.
# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# Selects the context element and every descendant carrying a ``style``
# attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Selects ``<a>`` elements (plain or in the XHTML namespace) whose ``href`` is
# non-empty after whitespace normalisation and does not start with ``#``.
_find_external_links = etree.XPath(
    ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
        as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

    def __init__(self, **kw):
        """
        Override any of the class-level options by keyword.

        Raises ``TypeError`` for a keyword that is not already an
        attribute of the instance, so misspelled options fail loudly.
        """
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

    def __call__(self, doc):
        """
        Cleans the document.

        ``doc`` may be an element or an ElementTree-like object with a
        ``getroot()`` method.  The tree is modified in place according to
        the option attributes of this instance; nothing is returned.

        Raises ``ValueError`` if both ``allow_tags`` and
        ``remove_unknown_tags`` are set.
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        # Work on copies so the instance/class-level options are not mutated.
        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only and
                    self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter(etree.Element):
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If we're deleting style then we don't have to remove JS links
                # from styles, otherwise...
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    # (chain on ``new`` so the expression() removal above is
                    # not thrown away)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                # Walk up until an <applet>/<object> ancestor is found;
                # running off the top of the tree means there is none.
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        # Collect first, modify afterwards, so the tree is not changed
        # while it is being iterated.
        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    # The root cannot be dropped; turn it into a bare <div>.
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get('rel')
                    if rel:
                        # Cheap substring test first, then an exact
                        # whitespace-delimited token test.
                        if ('nofollow' in rel
                                and ' nofollow ' in (' %s ' % rel)):
                            continue
                        rel = '%s nofollow' % rel
                    else:
                        rel = 'nofollow'
                    el.set('rel', rel)

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.

        Return true for an anchor that should *not* get ``nofollow``;
        this default implementation returns ``False`` for every anchor.
        """
        return False

    def allow_element(self, el):
        """
        Decide whether an element that is up for removal may stay.

        Looks up the link attribute(s) for ``el.tag`` in
        ``_tag_link_attrs`` and returns the verdict of
        ``allow_embedded_url`` on the URL(s) found there.  Returns
        ``False`` for tags without an entry or when a URL is missing.
        When a tag has several link attributes, all of them must pass.
        """
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        """
        Decide whether ``url`` may be embedded through element ``el``.

        Returns ``True`` only if the tag is permitted by
        ``whitelist_tags`` (or ``whitelist_tags`` is None), the scheme is
        ``http`` or ``https``, and the URL's host is in
        ``host_whitelist``.
        """
        if (self.whitelist_tags is not None
                and el.tag not in self.whitelist_tags):
            return False
        parts = urlsplit(url)
        if parts.scheme not in ('http', 'https'):
            return False
        # Use ``hostname`` rather than splitting ``netloc`` on ':' by hand:
        # it strips userinfo and port, so a URL such as
        # ``http://trusted.example:x@evil.example/`` is checked against
        # ``evil.example`` and cannot impersonate a whitelisted host.
        # ``hostname`` is lower-cased, and None when the URL has no host.
        return parts.hostname in self.host_whitelist

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        """
        Drop (with ``drop_tree()``) every node yielded by
        ``doc.iter(iterate)`` for which ``condition(node)`` is true.
        """
        # Collect first so the tree is not modified during iteration.
        bad = []
        for el in doc.iter(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    # NOTE(review): source lines 473-481 are collapsed in this listing.
    # ``__call__`` passes ``self._remove_javascript_link`` to
    # ``doc.rewrite_links``, so that method is presumably defined here --
    # restore it from the original module before relying on this class.

    # Bound ``sub`` that replaces ``/* ... */`` CSS comments (non-greedy,
    # may span lines).
    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempts to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.

        Returns ``True`` if, after stripping comments, backslashes and
        whitespace and lower-casing, the style contains ``javascript:``
        or ``expression(``; ``False`` otherwise.
        """
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        """
        Clean ``html`` and return the result in the same form it came in.

        A string is parsed with ``fromstring``; anything else is
        deep-copied, so the caller's object is left untouched.  The
        cleaned document is converted back with ``_transform_result``
        according to the type of the input.
        """
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)
# Module-level default cleaner (all options at their class defaults) and a
# shortcut to its ``clean_html`` method.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Each pattern exposes a ``body`` group and a ``host`` group.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): source lines 536-648 are collapsed in this listing.  The
# ``autolink`` and ``autolink_html`` functions used on the next statement
# (and listed in ``__all__``) are presumably defined there -- restore them
# from the original module.

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

# Defaults for ``word_break``: tags and CSS classes that are left unbroken.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in ``avoid_elements`` (by default
    ``<textarea>``, ``<pre>`` and ``<code>``), nor elements carrying one
    of the classes in ``avoid_classes``.  Skipping an element also skips
    its whole subtree.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    ``el`` is modified in place (its text and the text/tail of its
    descendants); nothing is returned.  ``max_width`` is the longest
    run of non-whitespace characters that is left untouched.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller-supplied ``avoid_elements`` (the module-level
    # default list used to be consulted here regardless of the argument).
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        if any(avoid in classes for avoid in avoid_classes):
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail belongs to the parent's text flow, so it is broken even
        # when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
698
def word_break_html(html, *args, **kw):
    """
    Parse ``html``, apply ``word_break`` to the resulting document and
    return it converted back according to the type of the input.

    Extra positional and keyword arguments are forwarded to
    ``word_break`` unchanged.
    """
    parsed = fromstring(html)
    word_break(parsed, *args, **kw)
    return _transform_result(type(html), parsed)
704
705 -def _break_text(text, max_width, break_character):
706 words = text.split() 707 for word in words: 708 if len(word) > max_width: 709 replacement = _insert_break(word, max_width, break_character) 710 text = text.replace(word, replacement) 711 return text
712 713 _break_prefer_re = re.compile(r'[^a-z]', re.I) 714
715 -def _insert_break(word, width, break_character):
716 orig_word = word 717 result = '' 718 while len(word) > width: 719 start = word[:width] 720 breaks = list(_break_prefer_re.finditer(start)) 721 if breaks: 722 last_break = breaks[-1] 723 # Only walk back up to 10 characters to find a nice break: 724 if last_break.end() > width-10: 725 # FIXME: should the break character be at the end of the 726 # chunk, or the beginning of the next chunk? 727 start = word[:last_break.end()] 728 result += start + break_character 729 word = word[len(start):] 730 result += word 731 return result
732