Package lxml :: Package html :: Module clean
[hide private]
[frames | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11  except ImportError: 
 12      # Python 3 
 13      from urllib.parse import urlsplit 
 14  from lxml import etree 
 15  from lxml.html import defs 
 16  from lxml.html import fromstring, tostring, XHTML_NAMESPACE 
 17  from lxml.html import xhtml_to_html, _transform_result 
 18   
 19  try: 
 20      unichr 
 21  except NameError: 
 22      # Python 3 
 23      unichr = chr 
 24  try: 
 25      unicode 
 26  except NameError: 
 27      # Python 3 
 28      unicode = str 
 29  try: 
 30      bytes 
 31  except NameError: 
 32      # Python < 2.6 
 33      bytes = str 
 34  try: 
 35      basestring 
 36  except NameError: 
 37      basestring = (str, bytes) 
 38   
 39   
 40  __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 
 41             'word_break', 'word_break_html'] 
 42   
 43  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 44  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 45  # I have multiple kinds of schemes searched; but should schemes be 
 46  #   whitelisted instead? 
 47  # max height? 
 48  # remove images?  Also in CSS?  background attribute? 
 49  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 50  #   allow *just* embedded YouTube movies) 
 51  # Log what was deleted and why? 
 52  # style="behavior: ..." might be bad in IE? 
 53  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 54  #   metas. 
 55  # UTF-7 detections?  Example: 
 56  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 57  #   you don't always have to have the charset set, if the page has no charset 
 58  #   and there's UTF7-like code in it. 
 59  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 60   
 61   
 62  # This is an IE-specific construct you can have in a stylesheet to 
 63  # run some Javascript: 
 64  _css_javascript_re = re.compile( 
 65      r'expression\s*\(.*?\)', re.S|re.I) 
 66   
 67  # Do I have to worry about @\nimport? 
 68  _css_import_re = re.compile( 
 69      r'@\s*import', re.I) 
 70   
 71  # All kinds of schemes besides just javascript: that can cause 
 72  # execution: 
 73  _javascript_scheme_re = re.compile( 
 74      r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I) 
 75  _substitute_whitespace = re.compile(r'\s+').sub 
 76  # FIXME: should data: be blocked? 
 77   
 78  # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx 
 79  _conditional_comment_re = re.compile( 
 80      r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) 
 81   
 82  _find_styled_elements = etree.XPath( 
 83      "descendant-or-self::*[@style]") 
 84   
 85  _find_external_links = etree.XPath( 
 86      ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" 
 87       "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), 
 88      namespaces={'x':XHTML_NAMESPACE}) 
 89   
90 -class Cleaner(object):
91 """ 92 Instances cleans the document of each of the possible offending 93 elements. The cleaning is controlled by attributes; you can 94 override attributes in a subclass, or set them in the constructor. 95 96 ``scripts``: 97 Removes any ``<script>`` tags. 98 99 ``javascript``: 100 Removes any Javascript, like an ``onclick`` attribute. 101 102 ``comments``: 103 Removes any comments. 104 105 ``style``: 106 Removes any style tags or attributes. 107 108 ``links``: 109 Removes any ``<link>`` tags 110 111 ``meta``: 112 Removes any ``<meta>`` tags 113 114 ``page_structure``: 115 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. 116 117 ``processing_instructions``: 118 Removes any processing instructions. 119 120 ``embedded``: 121 Removes any embedded objects (flash, iframes) 122 123 ``frames``: 124 Removes any frame-related tags 125 126 ``forms``: 127 Removes any form tags 128 129 ``annoying_tags``: 130 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>`` 131 132 ``remove_tags``: 133 A list of tags to remove. Only the tags will be removed, 134 their content will get pulled up into the parent tag. 135 136 ``kill_tags``: 137 A list of tags to kill. Killing also removes the tag's content, 138 i.e. the whole subtree, not just the tag itself. 139 140 ``allow_tags``: 141 A list of tags to include (default include all). 142 143 ``remove_unknown_tags``: 144 Remove any tags that aren't standard parts of HTML. 145 146 ``safe_attrs_only``: 147 If true, only include 'safe' attributes (specifically the list 148 from the feedparser HTML sanitisation web site). 149 150 ``safe_attrs``: 151 A set of attribute names to override the default list of attributes 152 considered 'safe' (when safe_attrs_only=True). 153 154 ``add_nofollow``: 155 If true, then any <a> tags will have ``rel="nofollow"`` added to them. 
156 157 ``host_whitelist``: 158 A list or set of hosts that you can use for embedded content 159 (for content like ``<object>``, ``<link rel="stylesheet">``, etc). 160 You can also implement/override the method 161 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to 162 implement more complex rules for what can be embedded. 163 Anything that passes this test will be shown, regardless of 164 the value of (for instance) ``embedded``. 165 166 Note that this parameter might not work as intended if you do not 167 make the links absolute before doing the cleaning. 168 169 ``whitelist_tags``: 170 A set of tags that can be included with ``host_whitelist``. 171 The default is ``iframe`` and ``embed``; you may wish to 172 include other tags like ``script``, or you may want to 173 implement ``allow_embedded_url`` for more control. Set to None to 174 include all tags. 175 176 This modifies the document *in place*. 177 """ 178 179 scripts = True 180 javascript = True 181 comments = True 182 style = False 183 links = True 184 meta = True 185 page_structure = True 186 processing_instructions = True 187 embedded = True 188 frames = True 189 forms = True 190 annoying_tags = True 191 remove_tags = None 192 allow_tags = None 193 kill_tags = None 194 remove_unknown_tags = True 195 safe_attrs_only = True 196 safe_attrs = defs.safe_attrs 197 add_nofollow = False 198 host_whitelist = () 199 whitelist_tags = set(['iframe', 'embed']) 200
def __init__(self, **kw):
    """
    Override cleaning options for this instance.

    Each keyword must name an attribute that already exists on the
    instance (normally one of the class-level defaults); its value is
    stored on the instance.  The first keyword that does not match an
    existing attribute raises ``TypeError``.
    """
    for option, setting in kw.items():
        if hasattr(self, option):
            setattr(self, option, setting)
            continue
        raise TypeError(
            "Unknown parameter: %s=%r" % (option, setting))
207 208 # Used to lookup the primary URL for a given tag that is up for 209 # removal: 210 _tag_link_attrs = dict( 211 script='src', 212 link='href', 213 # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html 214 # From what I can tell, both attributes can contain a link: 215 applet=['code', 'object'], 216 iframe='src', 217 embed='src', 218 layer='src', 219 # FIXME: there doesn't really seem like a general way to figure out what 220 # links an <object> tag uses; links often go in <param> tags with values 221 # that we don't really know. You'd have to have knowledge about specific 222 # kinds of plugins (probably keyed off classid), and match against those. 223 ##object=?, 224 # FIXME: not looking at the action currently, because it is more complex 225 # than than -- if you keep the form, you should keep the form controls. 226 ##form='action', 227 a='href', 228 ) 229
def __call__(self, doc):
    """
    Cleans the document in place.

    ``doc`` may be an element or an object with a ``getroot()`` method
    (an ElementTree), in which case its root element is cleaned.  What
    gets removed is controlled by the option attributes of this
    instance (``scripts``, ``javascript``, ``style``, ...).

    Raises ``ValueError`` when both ``allow_tags`` and
    ``remove_unknown_tags`` are set; this is checked before the
    document is touched, so a misconfigured cleaner never leaves a
    half-cleaned tree behind.
    """
    if self.remove_unknown_tags and self.allow_tags:
        raise ValueError(
            "It does not make sense to pass in both allow_tags and remove_unknown_tags")

    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    # kill_tags: dropped together with their content.
    # remove_tags: only the tag is dropped, content is pulled up.
    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Iterate over a snapshot: entries are deleted in the loop.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and
                self.safe_attrs == defs.safe_attrs):
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new`` so the expression() removal above
                # is not thrown away.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow:
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards, so the tree is not changed
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] is doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] is doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if self.remove_unknown_tags:
        allow_tags = set(defs.tags)
    else:
        allow_tags = self.allow_tags
    if allow_tags:
        bad = [el for el in doc.iter() if el.tag not in allow_tags]
        if bad:
            if bad[0] is doc:
                # The root cannot be dropped; turn it into a bare <div>.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')
399
def allow_follow(self, anchor):
    """
    Hook deciding whether ``anchor`` is exempt from rel="nofollow".

    The base implementation exempts nothing and always answers
    ``False``; override it to suppress rel="nofollow" on some anchors.
    """
    return False
405
def allow_element(self, el):
    """
    Decide whether ``el`` may stay even though its tag is up for removal.

    The tag must be listed in ``_tag_link_attrs``.  When the entry is a
    single attribute name, the answer is whatever
    ``allow_embedded_url`` returns for that attribute's URL.  When it
    is a list or tuple of names, every one of those attributes must be
    set and accepted by ``allow_embedded_url``.  A missing or empty
    attribute always yields ``False``.
    """
    try:
        attr_spec = self._tag_link_attrs[el.tag]
    except KeyError:
        return False
    if not isinstance(attr_spec, (list, tuple)):
        target = el.get(attr_spec)
        return self.allow_embedded_url(el, target) if target else False
    for attr_name in attr_spec:
        target = el.get(attr_name)
        if not (target and self.allow_embedded_url(el, target)):
            return False
    return True
423
def allow_embedded_url(self, el, url):
    """
    Decide whether ``url`` may be embedded through element ``el``.

    Returns ``True`` only when all of the following hold:

    * ``whitelist_tags`` is ``None`` or contains ``el.tag``;
    * the URL scheme is ``http`` or ``https``;
    * the URL's host is listed in ``host_whitelist``.

    The host is taken from the parsed URL's ``hostname``, so any
    ``user:password@`` prefix and ``:port`` suffix are ignored.  This
    prevents a URL such as ``http://trusted.example:80@evil.example/``
    from being matched against ``trusted.example``.  URLs that cannot
    be parsed, or that have no host, are rejected.
    """
    if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
        return False
    try:
        parts = urlsplit(url)
    except ValueError:
        # Unparsable URL: refuse it rather than fail the whole clean.
        return False
    if parts.scheme not in ('http', 'https'):
        return False
    # ``hostname`` is lower-cased and stripped of userinfo and port.
    host = parts.hostname
    if not host:
        return False
    return host in self.host_whitelist
435
def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    every comment whose text matches the conditional-comment pattern
    is dropped from ``doc``.
    """
    def looks_conditional(comment):
        return _conditional_comment_re.search(comment.text)

    self._kill_elements(doc, looks_conditional, etree.Comment)
446
def _kill_elements(self, doc, condition, iterate=None):
    """
    Drop every node of ``doc`` for which ``condition(node)`` is true.

    ``iterate`` is handed to ``doc.iter()`` to restrict which nodes are
    visited.  Matches are collected before any of them is dropped, so
    the tree is not modified while it is being walked.
    """
    doomed = [node for node in doc.iter(iterate) if condition(node)]
    for node in doomed:
        node.drop_tree()
454 462 463 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub 464
def _has_sneaky_javascript(self, style):
    """
    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
    checks for attempts to do stuff like this.

    The style text is normalized first: CSS comments, backslashes and
    all whitespace are stripped and the rest is lower-cased.  Returns
    ``True`` when the normalized text contains ``javascript:`` or
    ``expression(``.

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.
    """
    normalized = self._substitute_comments('', style).replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    return 'javascript:' in normalized or 'expression(' in normalized
485
def clean_html(self, html):
    """
    Clean ``html`` and return the result in the same type it came in.

    Strings are parsed with ``fromstring``; anything else is
    deep-copied first, so the object passed in is not modified.
    """
    original_type = type(html)
    tree = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    self(tree)
    return _transform_result(original_type, tree)
494 495 clean = Cleaner() 496 clean_html = clean.clean_html 497 498 ############################################################ 499 ## Autolinking 500 ############################################################ 501 502 _link_regexes = [ 503 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I), 504 # This is conservative, but autolinking can be a bit conservative: 505 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I), 506 ] 507 508 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a'] 509 510 _avoid_hosts = [ 511 re.compile(r'^localhost', re.I), 512 re.compile(r'\bexample\.(?:com|org|net)$', re.I), 513 re.compile(r'^127\.0\.0\.1$'), 514 ] 515 516 _avoid_classes = ['nolink'] 517 562 620 629 630 autolink_html.__doc__ = autolink.__doc__ 631 632 ############################################################ 633 ## Word wrapping 634 ############################################################ 635 636 _avoid_word_break_elements = ['pre', 'textarea', 'code'] 637 _avoid_word_break_classes = ['nobreak'] 638
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in ``avoid_elements`` (by default
    ``<pre>``, ``<textarea>`` and ``<code>``), nor elements whose
    ``class`` attribute contains one of ``avoid_classes`` (by default
    ``nobreak``).  Skipped elements are skipped together with
    everything inside them.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    The element is modified in place; nothing is returned.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller-supplied list rather than the module default.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_names = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_names:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail belongs to this element's text flow, so it is broken
        # even when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
679
def word_break_html(html, *args, **kw):
    """
    Parse ``html``, apply ``word_break`` to it (forwarding any extra
    arguments), and return the result converted back to the type of
    the input.
    """
    original_type = type(html)
    tree = fromstring(html)
    word_break(tree, *args, **kw)
    return _transform_result(original_type, tree)
685
# A run of non-whitespace characters, i.e. one "word" of the text.
_word_re = re.compile(r'\S+', re.UNICODE)


def _break_text(text, max_width, break_character):
    """
    Return ``text`` with ``break_character`` inserted into every word
    longer than ``max_width``.

    Each word is rewritten where it stands, so a long word that also
    occurs inside another word does not disturb that other word, and
    all whitespace is kept exactly as it was.
    """
    def break_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word

    return _word_re.sub(break_word, text)


# Any character that is not an ASCII letter is a preferred break point.
_break_prefer_re = re.compile(r'[^a-z]', re.I)


def _insert_break(word, width, break_character):
    """
    Split ``word`` into chunks of at most ``width`` characters and join
    them with ``break_character``.

    A chunk ends right after its last non-letter character when that
    position lies within the final 10 characters of the chunk;
    otherwise the chunk is cut at exactly ``width``.  A ``width`` below
    1 cannot produce a chunk, so the word is returned unchanged.
    """
    if width < 1:
        # Without this guard no progress is ever made and the loop
        # below would never terminate.
        return word
    pieces = []
    while len(word) > width:
        chunk = word[:width]
        last_break = None
        for last_break in _break_prefer_re.finditer(chunk):
            pass
        # Only walk back up to 10 characters to find a nice break:
        if last_break is not None and last_break.end() > width - 10:
            # FIXME: should the break character be at the end of the
            # chunk, or the beginning of the next chunk?
            chunk = word[:last_break.end()]
        pieces.append(chunk)
        word = word[len(chunk):]
    pieces.append(word)
    return break_character.join(pieces)
713