Package lxml :: Package html :: Module clean
[hide private]
[frames | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11  except ImportError: 
 12      # Python 3 
 13      from urllib.parse import urlsplit 
 14  from lxml import etree 
 15  from lxml.html import defs 
 16  from lxml.html import fromstring, tostring, XHTML_NAMESPACE 
 17  from lxml.html import _nons, _transform_result 
 18   
# Compatibility shims: each block below binds a name that is a builtin on
# only some Python versions, so the rest of the module can use it
# unconditionally.
try:
    set
except NameError:
    # No builtin ``set`` available: fall back to the ``sets`` module.
    # (Presumably this is for Python < 2.4 rather than Python 3, as the
    # previous comment claimed -- TODO confirm.)
    from sets import Set as set

# NOTE(review): the lookups below subscript ``__builtins__``, which assumes
# it is a dict.  If it is a module object instead, the subscript raises
# TypeError, which these handlers do not catch -- confirm this is acceptable.
try:
    unichr = __builtins__['unichr']
except (NameError, KeyError):
    # Python 3
    unichr = chr

try:
    unicode = __builtins__['unicode']
except (NameError, KeyError):
    # Python 3
    unicode = str

try:
    bytes = __builtins__['bytes']
except (NameError, KeyError):
    # Python < 2.6
    bytes = str

try:
    basestring = __builtins__['basestring']
except (NameError, KeyError):
    # No ``basestring`` builtin: accept either string type in
    # ``isinstance`` checks.
    basestring = (str, bytes)


# Public names exported by ``from lxml.html.clean import *``.
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']
 51   
 52  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 53  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 54  # I have multiple kinds of schemes searched; but should schemes be 
 55  #   whitelisted instead? 
 56  # max height? 
 57  # remove images?  Also in CSS?  background attribute? 
 58  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 59  #   allow *just* embedded YouTube movies) 
 60  # Log what was deleted and why? 
 61  # style="behavior: ..." might be bad in IE? 
 62  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 63  #   metas. 
 64  # UTF-7 detections?  Example: 
 65  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 66  #   you don't always have to have the charset set, if the page has no charset 
 67  #   and there's UTF7-like code in it. 
 68  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 69   
 70   
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
# (non-greedy match up to the first closing parenthesis; case-insensitive,
# and ``.`` also matches newlines)
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Do I have to worry about @\nimport?
# (``\s*`` allows any whitespace, including newlines, between ``@`` and
# ``import``)
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution:
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
# Bound ``sub`` method: ``_substitute_whitespace(repl, text)`` replaces every
# run of whitespace in ``text`` with ``repl``.
_substitute_whitespace = re.compile(r'\s+').sub
# FIXME: should data: be blocked?

# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
# Matches the ``[if ...]>`` opener inside a comment's text.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# Every element (including the context node itself) that carries a
# ``style`` attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Anchors, with or without the XHTML namespace, whose ``href`` is non-empty
# after whitespace normalization and does not start with ``#``.
_find_external_links = etree.XPath(
    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
 98   
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marque>``

    ``remove_tags``:
        A list of tags to remove.

    ``allow_tags``:
        A list of tags to include (default include all).  Cannot be
        combined with ``remove_unknown_tags``; cleaning raises
        ``ValueError`` when both are set.

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from `feedparser
        <http://feedparser.org/docs/html-sanitization.html>`_).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    # Option defaults.  The constructor only accepts keyword arguments that
    # name one of these class attributes (or any other existing attribute).
    scripts = True
    javascript = True
    comments = True
    # Style tags/attributes are kept by default; they are still scrubbed of
    # Javascript while ``javascript`` is true.
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    # True by default, which is why ``allow_tags`` cannot simply be passed
    # on its own: ``remove_unknown_tags=False`` is needed alongside it.
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False
    # Empty by default, so no embedded URL is whitelisted.
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

200 - def __init__(self, **kw):
201 for name, value in kw.items(): 202 if not hasattr(self, name): 203 raise TypeError( 204 "Unknown parameter: %s=%r" % (name, value)) 205 setattr(self, name, value)

    # Used to lookup the primary URL for a given tag that is up for
    # removal.  Values are either a single attribute name or a list of
    # attribute names; ``allow_element`` requires every listed attribute
    # to hold an allowed URL.
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

229 - def __call__(self, doc):
230 """ 231 Cleans the document. 232 """ 233 if hasattr(doc, 'getroot'): 234 # ElementTree instance, instead of an element 235 doc = doc.getroot() 236 # convert XHTML to HTML 237 for el in doc.iter(): 238 tag = el.tag 239 if isinstance(tag, basestring): 240 el.tag = _nons(tag) 241 # Normalize a case that IE treats <image> like <img>, and that 242 # can confuse either this step or later steps. 243 for el in doc.iter('image'): 244 el.tag = 'img' 245 if not self.comments: 246 # Of course, if we were going to kill comments anyway, we don't 247 # need to worry about this 248 self.kill_conditional_comments(doc) 249 kill_tags = set() 250 remove_tags = set(self.remove_tags or ()) 251 if self.allow_tags: 252 allow_tags = set(self.allow_tags) 253 else: 254 allow_tags = set() 255 if self.scripts: 256 kill_tags.add('script') 257 if self.safe_attrs_only: 258 safe_attrs = set(defs.safe_attrs) 259 for el in doc.iter(): 260 attrib = el.attrib 261 for aname in attrib.keys(): 262 if aname not in safe_attrs: 263 del attrib[aname] 264 if self.javascript: 265 if not self.safe_attrs_only: 266 # safe_attrs handles events attributes itself 267 for el in doc.iter(): 268 attrib = el.attrib 269 for aname in attrib.keys(): 270 if aname.startswith('on'): 271 del attrib[aname] 272 doc.rewrite_links(self._remove_javascript_link, 273 resolve_base_href=False) 274 if not self.style: 275 # If we're deleting style then we don't have to remove JS links 276 # from styles, otherwise... 277 for el in _find_styled_elements(doc): 278 old = el.get('style') 279 new = _css_javascript_re.sub('', old) 280 new = _css_import_re.sub('', old) 281 if self._has_sneaky_javascript(new): 282 # Something tricky is going on... 
283 del el.attrib['style'] 284 elif new != old: 285 el.set('style', new) 286 for el in list(doc.iter('style')): 287 if el.get('type', '').lower().strip() == 'text/javascript': 288 el.drop_tree() 289 continue 290 old = el.text or '' 291 new = _css_javascript_re.sub('', old) 292 # The imported CSS can do anything; we just can't allow: 293 new = _css_import_re.sub('', old) 294 if self._has_sneaky_javascript(new): 295 # Something tricky is going on... 296 el.text = '/* deleted */' 297 elif new != old: 298 el.text = new 299 if self.comments or self.processing_instructions: 300 # FIXME: why either? I feel like there's some obscure reason 301 # because you can put PIs in comments...? But I've already 302 # forgotten it 303 kill_tags.add(etree.Comment) 304 if self.processing_instructions: 305 kill_tags.add(etree.ProcessingInstruction) 306 if self.style: 307 kill_tags.add('style') 308 etree.strip_attributes(doc, 'style') 309 if self.links: 310 kill_tags.add('link') 311 elif self.style or self.javascript: 312 # We must get rid of included stylesheets if Javascript is not 313 # allowed, as you can put Javascript in them 314 for el in list(doc.iter('link')): 315 if 'stylesheet' in el.get('rel', '').lower(): 316 # Note this kills alternate stylesheets as well 317 el.drop_tree() 318 if self.meta: 319 kill_tags.add('meta') 320 if self.page_structure: 321 remove_tags.update(('head', 'html', 'title')) 322 if self.embedded: 323 # FIXME: is <layer> really embedded? 324 # We should get rid of any <param> tags not inside <applet>; 325 # These are not really valid anyway. 
326 for el in list(doc.iter('param')): 327 found_parent = False 328 parent = el.getparent() 329 while parent is not None and parent.tag not in ('applet', 'object'): 330 parent = parent.getparent() 331 if parent is None: 332 el.drop_tree() 333 kill_tags.update(('applet',)) 334 # The alternate contents that are in an iframe are a good fallback: 335 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) 336 if self.frames: 337 # FIXME: ideally we should look at the frame links, but 338 # generally frames don't mix properly with an HTML 339 # fragment anyway. 340 kill_tags.update(defs.frame_tags) 341 if self.forms: 342 remove_tags.add('form') 343 kill_tags.update(('button', 'input', 'select', 'textarea')) 344 if self.annoying_tags: 345 remove_tags.update(('blink', 'marque')) 346 347 _remove = [] 348 _kill = [] 349 for el in doc.iter(): 350 if el.tag in kill_tags: 351 if self.allow_element(el): 352 continue 353 _kill.append(el) 354 elif el.tag in remove_tags: 355 if self.allow_element(el): 356 continue 357 _remove.append(el) 358 359 if _remove and _remove[0] == doc: 360 # We have to drop the parent-most tag, which we can't 361 # do. Instead we'll rewrite it: 362 el = _remove.pop(0) 363 el.tag = 'div' 364 el.attrib.clear() 365 elif _kill and _kill[0] == doc: 366 # We have to drop the parent-most element, which we can't 367 # do. 
Instead we'll clear it: 368 el = _kill.pop(0) 369 if el.tag != 'html': 370 el.tag = 'div' 371 el.clear() 372 373 for el in _kill: 374 el.drop_tree() 375 for el in _remove: 376 el.drop_tag() 377 378 allow_tags = self.allow_tags 379 if self.remove_unknown_tags: 380 if allow_tags: 381 raise ValueError( 382 "It does not make sense to pass in both allow_tags and remove_unknown_tags") 383 allow_tags = set(defs.tags) 384 if allow_tags: 385 bad = [] 386 for el in doc.iter(): 387 if el.tag not in allow_tags: 388 bad.append(el) 389 for el in bad: 390 el.drop_tag() 391 if self.add_nofollow: 392 for el in _find_external_links(doc): 393 if not self.allow_follow(el): 394 el.set('rel', 'nofollow')
395
396 - def allow_follow(self, anchor):
397 """ 398 Override to suppress rel="nofollow" on some anchors. 399 """ 400 return False
401
402 - def allow_element(self, el):
403 if el.tag not in self._tag_link_attrs: 404 return False 405 attr = self._tag_link_attrs[el.tag] 406 if isinstance(attr, (list, tuple)): 407 for one_attr in attr: 408 url = el.get(one_attr) 409 if not url: 410 return False 411 if not self.allow_embedded_url(el, url): 412 return False 413 return True 414 else: 415 url = el.get(attr) 416 if not url: 417 return False 418 return self.allow_embedded_url(el, url)
419
420 - def allow_embedded_url(self, el, url):
421 if (self.whitelist_tags is not None 422 and el.tag not in self.whitelist_tags): 423 return False 424 scheme, netloc, path, query, fragment = urlsplit(url) 425 netloc = netloc.lower().split(':', 1)[0] 426 if scheme not in ('http', 'https'): 427 return False 428 if netloc in self.host_whitelist: 429 return True 430 return False
431
432 - def kill_conditional_comments(self, doc):
433 """ 434 IE conditional comments basically embed HTML that the parser 435 doesn't normally see. We can't allow anything like that, so 436 we'll kill any comments that could be conditional. 437 """ 438 bad = [] 439 self._kill_elements( 440 doc, lambda el: _conditional_comment_re.search(el.text), 441 etree.Comment)
442
443 - def _kill_elements(self, doc, condition, iterate=None):
444 bad = [] 445 for el in doc.iter(iterate): 446 if condition(el): 447 bad.append(el) 448 for el in bad: 449 el.drop_tree()

    # Bound ``sub`` method of a pattern matching ``/* ... */`` spans
    # (non-greedy, may cover several lines);
    # ``_substitute_comments(repl, text)`` replaces each of them with ``repl``.
    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

461 - def _has_sneaky_javascript(self, style):
462 """ 463 Depending on the browser, stuff like ``e x p r e s s i o n(...)`` 464 can get interpreted, or ``expre/* stuff */ssion(...)``. This 465 checks for attempt to do stuff like this. 466 467 Typically the response will be to kill the entire style; if you 468 have just a bit of Javascript in the style another rule will catch 469 that and remove only the Javascript from the style; this catches 470 more sneaky attempts. 471 """ 472 style = self._substitute_comments('', style) 473 style = style.replace('\\', '') 474 style = _substitute_whitespace('', style) 475 style = style.lower() 476 if 'javascript:' in style: 477 return True 478 if 'expression(' in style: 479 return True 480 return False
481
482 - def clean_html(self, html):
483 result_type = type(html) 484 if isinstance(html, basestring): 485 doc = fromstring(html) 486 else: 487 doc = copy.deepcopy(html) 488 self(doc) 489 return _transform_result(result_type, doc)

# Ready-made cleaner using the default options, and its bound
# ``clean_html`` method as a module-level function.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Each pattern provides a ``body`` group and a ``host`` group.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    # (the host class includes ``-`` like the http pattern above; without it
    # a hyphenated mail host was cut off at the hyphen)
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): the definitions of ``autolink`` and ``autolink_html`` used
# below are not visible in this listing (its numbering skips from 513 to
# 625); they presumably belong right here -- confirm against the full module.

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']

def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in ``avoid_elements`` (by default
    ``<textarea>``, ``<pre>`` and ``<code>``), nor elements having one of
    the ``avoid_classes`` among their classes (by default ``nobreak``).
    Such elements are skipped together with everything inside them.

    Breaks words longer than ``max_width`` by inserting &#8203;, which is
    a unicode character for Zero Width Space character.  This generally
    takes up no space in rendering, but does copy as a space, and in
    monospace contexts usually takes up space.

    The element is modified in place; nothing is returned.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's list here, not the module default.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_names = class_name.split()
        if any(avoid in class_names for avoid in avoid_classes):
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail belongs to this element's text flow, so it is broken
        # even when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
675
def word_break_html(html, *args, **kw):
    """
    Parse ``html`` with ``fromstring``, apply ``word_break`` to the parsed
    document and return it converted by ``_transform_result`` according to
    the type of the input.

    Additional positional and keyword arguments are passed on to
    ``word_break``.
    """
    original_type = type(html)
    tree = fromstring(html)
    word_break(tree, *args, **kw)
    return _transform_result(original_type, tree)
681
# Bound ``sub`` of a pattern matching one whitespace-delimited word.
_substitute_words = re.compile(r'\S+', re.U).sub

def _break_text(text, max_width, break_character):
    """
    Return ``text`` with ``break_character`` inserted into every
    whitespace-delimited word longer than ``max_width``.

    Each word is rewritten at its own position.  Replacing the word text
    globally would also rewrite other occurrences of it, including those
    inside longer words, and insert the breaks at the wrong places.
    Whitespace is left untouched.
    """
    def _break_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word

    return _substitute_words(_break_word, text)
689 690 _break_prefer_re = re.compile(r'[^a-z]', re.I) 691
692 -def _insert_break(word, width, break_character):
693 orig_word = word 694 result = '' 695 while len(word) > width: 696 start = word[:width] 697 breaks = list(_break_prefer_re.finditer(start)) 698 if breaks: 699 last_break = breaks[-1] 700 # Only walk back up to 10 characters to find a nice break: 701 if last_break.end() > width-10: 702 # FIXME: should the break character be at the end of the 703 # chunk, or the beginning of the next chunk? 704 start = word[:last_break.end()] 705 result += start + break_character 706 word = word[len(start):] 707 result += word 708 return result
709