Package lxml :: Package html :: Module clean
[hide private]
[frames | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11  except ImportError: 
 12      # Python 3 
 13      from urllib.parse import urlsplit 
 14  from lxml import etree 
 15  from lxml.html import defs 
 16  from lxml.html import fromstring, tostring, XHTML_NAMESPACE 
 17  from lxml.html import xhtml_to_html, _transform_result 
 18   
 19  try: 
 20      set 
 21  except NameError: 
 22      # Python < 2.4 (no builtin set; the sets module does not exist on Python 3) 
 23      from sets import Set as set 
 24   
 25  try: 
 26      unichr = __builtins__['unichr'] 
 27  except (NameError, KeyError): 
 28      # Python 3 
 29      unichr = chr 
 30   
 31  try: 
 32      unicode = __builtins__['unicode'] 
 33  except (NameError, KeyError): 
 34      # Python 3 
 35      unicode = str 
 36   
 37  try: 
 38      bytes = __builtins__['bytes'] 
 39  except (NameError, KeyError): 
 40      # Python < 2.6 
 41      bytes = str 
 42   
 43  try: 
 44      basestring = __builtins__['basestring'] 
 45  except (NameError, KeyError): 
 46      basestring = (str, bytes) 
 47   
 48   
 49  __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 
 50             'word_break', 'word_break_html'] 
 51   
 52  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 53  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 54  # I have multiple kinds of schemes searched; but should schemes be 
 55  #   whitelisted instead? 
 56  # max height? 
 57  # remove images?  Also in CSS?  background attribute? 
 58  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 59  #   allow *just* embedded YouTube movies) 
 60  # Log what was deleted and why? 
 61  # style="behavior: ..." might be bad in IE? 
 62  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 63  #   metas. 
 64  # UTF-7 detections?  Example: 
 65  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 66  #   you don't always have to have the charset set, if the page has no charset 
 67  #   and there's UTF7-like code in it. 
 68  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 69   
 70   
 71  # IE-specific CSS construct, expression(...), that can be put in a stylesheet to 
 72  # run some Javascript; matched non-greedily up to the first ')': 
 73  _css_javascript_re = re.compile( 
 74      r'expression\s*\(.*?\)', re.S|re.I) 
 75   
 76  # Matches CSS @import; the \s* also covers a newline between '@' and 'import': 
 77  _css_import_re = re.compile( 
 78      r'@\s*import', re.I) 
 79   
 80  # All kinds of schemes besides just javascript: that can cause 
 81  # execution: 
 82  _javascript_scheme_re = re.compile( 
 83      r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I) 
 84  _substitute_whitespace = re.compile(r'\s+').sub 
 85  # FIXME: should data: be blocked?  (It currently is: see _javascript_scheme_re.) 
 86   
 87  # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx 
 88  _conditional_comment_re = re.compile( 
 89      r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) 
 90   
 91  _find_styled_elements = etree.XPath( 
 92      "descendant-or-self::*[@style]") 
 93   
 94  _find_external_links = etree.XPath( 
 95      ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" 
 96       "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), 
 97      namespaces={'x':XHTML_NAMESPACE}) 
 98   
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).  Cannot be
        combined with ``remove_unknown_tags``: cleaning raises
        ``ValueError`` when both are set, so pass
        ``remove_unknown_tags=False`` along with ``allow_tags``.

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from `feedparser
        <http://feedparser.org/docs/html-sanitization.html>`_).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    # Class-level defaults for every option described in the docstring.
    # The constructor only accepts keywords that name an attribute that
    # already exists, so a new option needs a default here.
    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False
    host_whitelist = ()
    # This set object is shared by every instance that does not override
    # the option; replace it rather than mutating it in place.
    whitelist_tags = set(['iframe', 'embed'])
206 - def __init__(self, **kw):
207 for name, value in kw.items(): 208 if not hasattr(self, name): 209 raise TypeError( 210 "Unknown parameter: %s=%r" % (name, value)) 211 setattr(self, name, value)
212 213 # Used to lookup the primary URL for a given tag that is up for 214 # removal: 215 _tag_link_attrs = dict( 216 script='src', 217 link='href', 218 # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html 219 # From what I can tell, both attributes can contain a link: 220 applet=['code', 'object'], 221 iframe='src', 222 embed='src', 223 layer='src', 224 # FIXME: there doesn't really seem like a general way to figure out what 225 # links an <object> tag uses; links often go in <param> tags with values 226 # that we don't really know. You'd have to have knowledge about specific 227 # kinds of plugins (probably keyed off classid), and match against those. 228 ##object=?, 229 # FIXME: not looking at the action currently, because it is more complex 230 # than than -- if you keep the form, you should keep the form controls. 231 ##form='action', 232 a='href', 233 ) 234
def __call__(self, doc):
    """
    Cleans the document, modifying it in place.

    ``doc`` is an element, or any object with a ``getroot()`` method
    (such as an ElementTree), in which case its root element is
    cleaned.  Which parts are removed is controlled by the option
    attributes on this instance.

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            for aname in attrib.keys():
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                # Chain the substitutions: the second one must work on
                # the result of the first, otherwise the expression(...)
                # removal is thrown away.
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow:
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards, so the tree is not changed
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # The root cannot be dropped; turn it into a bare <div>.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')
403
def allow_follow(self, anchor):
    """
    Hook deciding whether ``anchor`` is exempt from ``rel="nofollow"``.

    The base implementation exempts nothing and always returns False;
    override it to suppress ``rel="nofollow"`` on some anchors.
    """
    return False
409
def allow_element(self, el):
    """
    Tell whether ``el`` should be kept even though its tag is up for
    removal.

    The tag is looked up in ``_tag_link_attrs``; tags without an entry
    are never allowed.  For a single attribute name the result is
    whatever ``allow_embedded_url`` returns for that attribute's URL.
    For a list of names, every attribute must be present and accepted
    by ``allow_embedded_url``.  A missing or empty URL means False.
    """
    try:
        link_attrs = self._tag_link_attrs[el.tag]
    except KeyError:
        return False
    if not isinstance(link_attrs, (list, tuple)):
        url = el.get(link_attrs)
        if url:
            return self.allow_embedded_url(el, url)
        return False
    for name in link_attrs:
        url = el.get(name)
        if not (url and self.allow_embedded_url(el, url)):
            return False
    return True
427
def allow_embedded_url(self, el, url):
    """
    Decide whether ``el`` may embed content from ``url``.

    Returns True only when all of the following hold: the element's
    tag is in ``whitelist_tags`` (or ``whitelist_tags`` is None), the
    URL scheme is ``http`` or ``https``, and the URL's host is in
    ``host_whitelist``.  Returns False otherwise.
    """
    if (self.whitelist_tags is not None
        and el.tag not in self.whitelist_tags):
        return False
    parts = urlsplit(url)
    if parts.scheme not in ('http', 'https'):
        return False
    # Use the parsed hostname instead of cutting netloc at the first
    # ':'.  netloc may carry userinfo ("user:pw@host"), so a URL such
    # as "http://trusted.example:x@evil.example/" must be judged by
    # "evil.example", not by the text in front of the colon.
    host = parts.hostname
    if host is None:
        return False
    return host in self.host_whitelist
439
def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Modifies ``doc`` in place: every comment whose text matches
    ``_conditional_comment_re`` is dropped through ``_kill_elements``.
    """
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)
450
451 - def _kill_elements(self, doc, condition, iterate=None):
452 bad = [] 453 for el in doc.iter(iterate): 454 if condition(el): 455 bad.append(el) 456 for el in bad: 457 el.drop_tree()
# Bound ``sub`` of a pattern matching CSS ``/* ... */`` comments.  The
# match is non-greedy and re.S lets a comment span several lines.  Call
# it as ``self._substitute_comments(replacement, text)``.
_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
def _has_sneaky_javascript(self, style):
    """
    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
    checks for attempts to do stuff like this.

    The text is normalized first (CSS comments, backslashes and all
    whitespace removed, then lowercased) and True is returned when
    the result contains ``javascript:`` or ``expression(``.

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.
    """
    normalized = self._substitute_comments('', style).replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    for marker in ('javascript:', 'expression('):
        if marker in normalized:
            return True
    return False
489
def clean_html(self, html):
    """
    Clean ``html`` and return the result in the same form it came in.

    A string is parsed with ``fromstring``; anything else is
    deep-copied, so the caller's object is left untouched.  The
    cleaned document is converted back with ``_transform_result``
    using the type of the original argument.
    """
    if isinstance(html, basestring):
        tree = fromstring(html)
    else:
        tree = copy.deepcopy(html)
    self(tree)
    return _transform_result(type(html), tree)
# Module-level convenience API: a Cleaner with the default options and
# its bound clean_html method.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): the listing skips source lines 522-632 at this point;
# autolink and autolink_html, used on the next line, are presumably
# defined in that gap -- confirm against the full module.
autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Works in place on ``el`` and, recursively, on its children.

    Doesn't affect any of the tags in avoid_elements, by default
    ``<pre>``, ``<textarea>`` and ``<code>``, nor any element whose
    ``class`` attribute contains one of avoid_classes (by default
    ``nobreak``).

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's avoid_elements rather than the module default.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        for avoid in avoid_classes:
            if avoid in classes:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail is handled here, so it is broken even when the
        # child itself was skipped by the checks above.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
683
def word_break_html(html, *args, **kw):
    """
    Apply ``word_break`` to an HTML document given as ``html``.

    The input is parsed with ``fromstring``; extra positional and
    keyword arguments are passed on to ``word_break``.  The result is
    converted back with ``_transform_result`` using the type of the
    original argument.
    """
    tree = fromstring(html)
    word_break(tree, *args, **kw)
    return _transform_result(type(html), tree)
689
690 -def _break_text(text, max_width, break_character):
691 words = text.split() 692 for word in words: 693 if len(word) > max_width: 694 replacement = _insert_break(word, max_width, break_character) 695 text = text.replace(word, replacement) 696 return text
697 698 _break_prefer_re = re.compile(r'[^a-z]', re.I) 699
700 -def _insert_break(word, width, break_character):
701 orig_word = word 702 result = '' 703 while len(word) > width: 704 start = word[:width] 705 breaks = list(_break_prefer_re.finditer(start)) 706 if breaks: 707 last_break = breaks[-1] 708 # Only walk back up to 10 characters to find a nice break: 709 if last_break.end() > width-10: 710 # FIXME: should the break character be at the end of the 711 # chunk, or the beginning of the next chunk? 712 start = word[:last_break.end()] 713 result += start + break_character 714 word = word[len(start):] 715 result += word 716 return result
717