Package lxml :: Package html :: Module clean
[hide private]
[frames] | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11  except ImportError: 
 12      # Python 3 
 13      from urllib.parse import urlsplit 
 14  from lxml import etree 
 15  from lxml.html import defs 
 16  from lxml.html import fromstring, tostring, XHTML_NAMESPACE 
 17  from lxml.html import xhtml_to_html, _transform_result 
 18   
# Python 2/3 compatibility shims.  Each ``try`` merely references a
# builtin name; when that raises NameError the name is bound to the
# closest equivalent so the rest of the module can use it unconditionally.
try:
    unichr
except NameError:
    # Python 3
    unichr = chr
try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    bytes
except NameError:
    # Python < 2.6
    bytes = str
try:
    basestring
except NameError:
    # No ``basestring`` builtin: use a tuple, which isinstance() accepts.
    basestring = (str, bytes)


# Public names of this module.
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']
 42   
 43  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 44  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 45  # I have multiple kinds of schemes searched; but should schemes be 
 46  #   whitelisted instead? 
 47  # max height? 
 48  # remove images?  Also in CSS?  background attribute? 
 49  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 50  #   allow *just* embedded YouTube movies) 
 51  # Log what was deleted and why? 
 52  # style="behavior: ..." might be bad in IE? 
 53  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 54  #   metas. 
 55  # UTF-7 detections?  Example: 
 56  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 57  #   you don't always have to have the charset set, if the page has no charset 
 58  #   and there's UTF7-like code in it. 
 59  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 60   
 61   
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript.  The match is non-greedy, so it stops at the
# first ')' after ``expression(``; re.S lets it span newlines.
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Matches the start of a CSS ``@import`` rule (whitespace, including
# newlines, is allowed between '@' and 'import').
# Do I have to worry about @\nimport?
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution:
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I)
# Callable ``(replacement, text)`` that substitutes every run of whitespace.
_substitute_whitespace = re.compile(r'\s+').sub
# FIXME: should data: be blocked?
# (NOTE: ``data`` is currently part of _javascript_scheme_re above.)

# Matches the ``[if ...]>`` marker found in IE conditional comments.
# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# Selects the context element and every descendant carrying a ``style``
# attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Selects <a> elements (plain or in the XHTML namespace) whose href is
# non-empty after whitespace normalisation and does not start with '#'.
_find_external_links = etree.XPath(
    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
 89   
class Cleaner(object):
    """
    Instances clean a document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    # Option defaults.  The constructor only accepts keyword arguments
    # whose names already exist as attributes here.
    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False
    host_whitelist = ()
    # NOTE: this set object lives on the class and is shared by every
    # instance that does not override it; assign a new set rather than
    # mutating it in place.
    whitelist_tags = set(['iframe', 'embed'])

196 - def __init__(self, **kw):
197 for name, value in kw.items(): 198 if not hasattr(self, name): 199 raise TypeError( 200 "Unknown parameter: %s=%r" % (name, value)) 201 setattr(self, name, value)

    # Used to lookup the primary URL for a given tag that is up for
    # removal.  A value is either one attribute name or a list of
    # attribute names (``allow_element`` handles both shapes):
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

225 - def __call__(self, doc):
226 """ 227 Cleans the document. 228 """ 229 if hasattr(doc, 'getroot'): 230 # ElementTree instance, instead of an element 231 doc = doc.getroot() 232 # convert XHTML to HTML 233 xhtml_to_html(doc) 234 # Normalize a case that IE treats <image> like <img>, and that 235 # can confuse either this step or later steps. 236 for el in doc.iter('image'): 237 el.tag = 'img' 238 if not self.comments: 239 # Of course, if we were going to kill comments anyway, we don't 240 # need to worry about this 241 self.kill_conditional_comments(doc) 242 243 kill_tags = set(self.kill_tags or ()) 244 remove_tags = set(self.remove_tags or ()) 245 allow_tags = set(self.allow_tags or ()) 246 247 if self.scripts: 248 kill_tags.add('script') 249 if self.safe_attrs_only: 250 safe_attrs = set(defs.safe_attrs) 251 for el in doc.iter(): 252 attrib = el.attrib 253 for aname in attrib.keys(): 254 if aname not in safe_attrs: 255 del attrib[aname] 256 if self.javascript: 257 if not self.safe_attrs_only: 258 # safe_attrs handles events attributes itself 259 for el in doc.iter(): 260 attrib = el.attrib 261 for aname in attrib.keys(): 262 if aname.startswith('on'): 263 del attrib[aname] 264 doc.rewrite_links(self._remove_javascript_link, 265 resolve_base_href=False) 266 if not self.style: 267 # If we're deleting style then we don't have to remove JS links 268 # from styles, otherwise... 269 for el in _find_styled_elements(doc): 270 old = el.get('style') 271 new = _css_javascript_re.sub('', old) 272 new = _css_import_re.sub('', old) 273 if self._has_sneaky_javascript(new): 274 # Something tricky is going on... 
275 del el.attrib['style'] 276 elif new != old: 277 el.set('style', new) 278 for el in list(doc.iter('style')): 279 if el.get('type', '').lower().strip() == 'text/javascript': 280 el.drop_tree() 281 continue 282 old = el.text or '' 283 new = _css_javascript_re.sub('', old) 284 # The imported CSS can do anything; we just can't allow: 285 new = _css_import_re.sub('', old) 286 if self._has_sneaky_javascript(new): 287 # Something tricky is going on... 288 el.text = '/* deleted */' 289 elif new != old: 290 el.text = new 291 if self.comments or self.processing_instructions: 292 # FIXME: why either? I feel like there's some obscure reason 293 # because you can put PIs in comments...? But I've already 294 # forgotten it 295 kill_tags.add(etree.Comment) 296 if self.processing_instructions: 297 kill_tags.add(etree.ProcessingInstruction) 298 if self.style: 299 kill_tags.add('style') 300 etree.strip_attributes(doc, 'style') 301 if self.links: 302 kill_tags.add('link') 303 elif self.style or self.javascript: 304 # We must get rid of included stylesheets if Javascript is not 305 # allowed, as you can put Javascript in them 306 for el in list(doc.iter('link')): 307 if 'stylesheet' in el.get('rel', '').lower(): 308 # Note this kills alternate stylesheets as well 309 el.drop_tree() 310 if self.meta: 311 kill_tags.add('meta') 312 if self.page_structure: 313 remove_tags.update(('head', 'html', 'title')) 314 if self.embedded: 315 # FIXME: is <layer> really embedded? 316 # We should get rid of any <param> tags not inside <applet>; 317 # These are not really valid anyway. 
318 for el in list(doc.iter('param')): 319 found_parent = False 320 parent = el.getparent() 321 while parent is not None and parent.tag not in ('applet', 'object'): 322 parent = parent.getparent() 323 if parent is None: 324 el.drop_tree() 325 kill_tags.update(('applet',)) 326 # The alternate contents that are in an iframe are a good fallback: 327 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) 328 if self.frames: 329 # FIXME: ideally we should look at the frame links, but 330 # generally frames don't mix properly with an HTML 331 # fragment anyway. 332 kill_tags.update(defs.frame_tags) 333 if self.forms: 334 remove_tags.add('form') 335 kill_tags.update(('button', 'input', 'select', 'textarea')) 336 if self.annoying_tags: 337 remove_tags.update(('blink', 'marquee')) 338 339 _remove = [] 340 _kill = [] 341 for el in doc.iter(): 342 if el.tag in kill_tags: 343 if self.allow_element(el): 344 continue 345 _kill.append(el) 346 elif el.tag in remove_tags: 347 if self.allow_element(el): 348 continue 349 _remove.append(el) 350 351 if _remove and _remove[0] == doc: 352 # We have to drop the parent-most tag, which we can't 353 # do. Instead we'll rewrite it: 354 el = _remove.pop(0) 355 el.tag = 'div' 356 el.attrib.clear() 357 elif _kill and _kill[0] == doc: 358 # We have to drop the parent-most element, which we can't 359 # do. 
Instead we'll clear it: 360 el = _kill.pop(0) 361 if el.tag != 'html': 362 el.tag = 'div' 363 el.clear() 364 365 _kill.reverse() # start with innermost tags 366 for el in _kill: 367 el.drop_tree() 368 for el in _remove: 369 el.drop_tag() 370 371 allow_tags = self.allow_tags 372 if self.remove_unknown_tags: 373 if allow_tags: 374 raise ValueError( 375 "It does not make sense to pass in both allow_tags and remove_unknown_tags") 376 allow_tags = set(defs.tags) 377 if allow_tags: 378 bad = [] 379 for el in doc.iter(): 380 if el.tag not in allow_tags: 381 bad.append(el) 382 if bad: 383 if bad[0] is doc: 384 el = bad.pop(0) 385 el.tag = 'div' 386 el.attrib.clear() 387 for el in bad: 388 el.drop_tag() 389 if self.add_nofollow: 390 for el in _find_external_links(doc): 391 if not self.allow_follow(el): 392 el.set('rel', 'nofollow')
393
394 - def allow_follow(self, anchor):
395 """ 396 Override to suppress rel="nofollow" on some anchors. 397 """ 398 return False
399
400 - def allow_element(self, el):
401 if el.tag not in self._tag_link_attrs: 402 return False 403 attr = self._tag_link_attrs[el.tag] 404 if isinstance(attr, (list, tuple)): 405 for one_attr in attr: 406 url = el.get(one_attr) 407 if not url: 408 return False 409 if not self.allow_embedded_url(el, url): 410 return False 411 return True 412 else: 413 url = el.get(attr) 414 if not url: 415 return False 416 return self.allow_embedded_url(el, url)
417
418 - def allow_embedded_url(self, el, url):
419 if (self.whitelist_tags is not None 420 and el.tag not in self.whitelist_tags): 421 return False 422 scheme, netloc, path, query, fragment = urlsplit(url) 423 netloc = netloc.lower().split(':', 1)[0] 424 if scheme not in ('http', 'https'): 425 return False 426 if netloc in self.host_whitelist: 427 return True 428 return False
429
430 - def kill_conditional_comments(self, doc):
431 """ 432 IE conditional comments basically embed HTML that the parser 433 doesn't normally see. We can't allow anything like that, so 434 we'll kill any comments that could be conditional. 435 """ 436 bad = [] 437 self._kill_elements( 438 doc, lambda el: _conditional_comment_re.search(el.text), 439 etree.Comment)
440
441 - def _kill_elements(self, doc, condition, iterate=None):
442 bad = [] 443 for el in doc.iter(iterate): 444 if condition(el): 445 bad.append(el) 446 for el in bad: 447 el.drop_tree()

    # NOTE(review): the listing numbering jumps from 448 to 456 here, so
    # some source lines are not visible in this view.  Presumably that is
    # where ``_remove_javascript_link`` (passed to ``rewrite_links`` in
    # ``__call__``) is defined -- confirm against the full file.

    # Callable ``(replacement, text)`` substituting every ``/* ... */``
    # span; re.S lets a span run across newlines.  Used by
    # ``_has_sneaky_javascript`` as ``self._substitute_comments``.
    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

459 - def _has_sneaky_javascript(self, style):
460 """ 461 Depending on the browser, stuff like ``e x p r e s s i o n(...)`` 462 can get interpreted, or ``expre/* stuff */ssion(...)``. This 463 checks for attempt to do stuff like this. 464 465 Typically the response will be to kill the entire style; if you 466 have just a bit of Javascript in the style another rule will catch 467 that and remove only the Javascript from the style; this catches 468 more sneaky attempts. 469 """ 470 style = self._substitute_comments('', style) 471 style = style.replace('\\', '') 472 style = _substitute_whitespace('', style) 473 style = style.lower() 474 if 'javascript:' in style: 475 return True 476 if 'expression(' in style: 477 return True 478 return False
479
480 - def clean_html(self, html):
481 result_type = type(html) 482 if isinstance(html, basestring): 483 doc = fromstring(html) 484 else: 485 doc = copy.deepcopy(html) 486 self(doc) 487 return _transform_result(result_type, doc)

# Module-level convenience cleaner using the default options, and its
# bound ``clean_html`` method exported as a plain function.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Patterns for linkable text.  Each defines the named groups ``body``
# and ``host``.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): listing lines 511-623 are collapsed in this view.
# Presumably ``autolink`` and ``autolink_html`` (listed in ``__all__``
# and used on the next statement) are defined there -- confirm against
# the full file.

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

# Defaults for ``word_break``: tags and CSS classes that are left alone.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']

def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements (by default
    ``<pre>``, ``<textarea>`` and ``<code>``), nor elements carrying
    one of the classes in avoid_classes (by default ``nobreak``).
    Such an element is skipped together with everything inside it.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    The element is modified in place; nothing is returned.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's avoid_elements rather than the module default.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_names = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_names:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail text belongs to this element's content, so it is
        # broken even when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)

def word_break_html(html, *args, **kw):
    """
    Parse ``html``, run ``word_break`` on it and hand the tree back
    through ``_transform_result`` together with the input's type.

    Extra positional and keyword arguments go to ``word_break`` as is.
    """
    original_type = type(html)
    tree = fromstring(html)
    word_break(tree, *args, **kw)
    return _transform_result(original_type, tree)

def _break_text(text, max_width, break_character):
    """
    Return ``text`` with break characters inserted into long words.

    A word is a maximal run of non-whitespace characters.  Words longer
    than ``max_width`` are passed through ``_insert_break``; shorter
    words and all whitespace are left exactly as they were.

    Each word is rewritten at its own position.  A long word that also
    occurs inside another word therefore cannot alter that other word.
    """
    def _break_long_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word

    return re.sub(r'\S+', _break_long_word, text)

# Preferred break positions: any character that is not an ASCII letter
# (matched case-insensitively).
_break_prefer_re = re.compile(r'[^a-z]', re.I)


def _insert_break(word, width, break_character):
    """
    Split ``word`` into chunks of at most ``width`` characters joined
    by ``break_character``.

    A chunk is cut right after the last non-letter character within
    its first ``width`` characters, provided that cut lies fewer than
    10 characters before the hard limit; otherwise it is cut at
    exactly ``width`` characters.

    Raises ``ValueError`` if ``width`` is less than 1, since no chunk
    could then ever be consumed and the loop would not terminate.
    """
    if width < 1:
        raise ValueError(
            "width must be at least 1, got %r" % (width,))
    chunks = []
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nice break:
            if last_break.end() > width-10:
                # FIXME: should the break character be at the end of the
                # chunk, or the beginning of the next chunk?
                start = word[:last_break.end()]
        chunks.append(start)
        word = word[len(start):]
    chunks.append(word)
    return break_character.join(chunks)
