Package lxml :: Package html :: Module clean
[hide private]
[frames] | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11  except ImportError: 
 12      # Python 3 
 13      from urllib.parse import urlsplit 
 14  from lxml import etree 
 15  from lxml.html import defs 
 16  from lxml.html import fromstring, tostring, XHTML_NAMESPACE 
 17  from lxml.html import xhtml_to_html, _transform_result 
 18   
 19  try: 
 20      unichr 
 21  except NameError: 
 22      # Python 3 
 23      unichr = chr 
 24  try: 
 25      unicode 
 26  except NameError: 
 27      # Python 3 
 28      unicode = str 
 29  try: 
 30      bytes 
 31  except NameError: 
 32      # Python < 2.6 
 33      bytes = str 
 34  try: 
 35      basestring 
 36  except NameError: 
 37      basestring = (str, bytes) 
 38   
 39   
 40  __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 
 41             'word_break', 'word_break_html'] 
 42   
 43  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 44  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 45  # I have multiple kinds of schemes searched; but should schemes be 
 46  #   whitelisted instead? 
 47  # max height? 
 48  # remove images?  Also in CSS?  background attribute? 
 49  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 50  #   allow *just* embedded YouTube movies) 
 51  # Log what was deleted and why? 
 52  # style="behavior: ..." might be bad in IE? 
 53  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 54  #   metas. 
 55  # UTF-7 detections?  Example: 
 56  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 57  #   you don't always have to have the charset set, if the page has no charset 
 58  #   and there's UTF7-like code in it. 
 59  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 60   
 61   
 62  # This is an IE-specific construct you can have in a stylesheet to 
 63  # run some Javascript: 
 64  _css_javascript_re = re.compile( 
 65      r'expression\s*\(.*?\)', re.S|re.I) 
 66   
 67  # Do I have to worry about @\nimport? 
 68  _css_import_re = re.compile( 
 69      r'@\s*import', re.I) 
 70   
 71  # All kinds of schemes besides just javascript: that can cause 
 72  # execution: 
 73  _javascript_scheme_re = re.compile( 
 74      r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I) 
 75  _substitute_whitespace = re.compile(r'\s+').sub 
 76  # FIXME: should data: be blocked? 
 77   
 78  # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx 
 79  _conditional_comment_re = re.compile( 
 80      r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) 
 81   
 82  _find_styled_elements = etree.XPath( 
 83      "descendant-or-self::*[@style]") 
 84   
 85  _find_external_links = etree.XPath( 
 86      ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" 
 87       "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), 
 88      namespaces={'x':XHTML_NAMESPACE}) 
 89   
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from `feedparser
        <http://feedparser.org/docs/html-sanitization.html>`_).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

    def __init__(self, **kw):
        """
        Override any of the class-level options by keyword.

        Raises TypeError for a keyword that does not name an existing
        attribute, so that misspelled options are not silently ignored.
        """
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

    def __call__(self, doc):
        """
        Cleans the document in place, according to the options set on
        this instance.

        ``doc`` may be an element or an object with ``getroot()`` (an
        ElementTree).  Raises ValueError if both ``allow_tags`` and
        ``remove_unknown_tags`` are set.
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.iter():
                attrib = el.attrib
                # Snapshot the names: attributes are deleted while looping.
                for aname in list(attrib.keys()):
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs handles events attributes itself
                for el in doc.iter():
                    attrib = el.attrib
                    for aname in list(attrib.keys()):
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If we're deleting style then we don't have to remove JS links
                # from styles, otherwise...
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    # Chain on ``new`` so the expression() removal above
                    # is kept, not overwritten.
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] is doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] is doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = [el for el in doc.iter() if el.tag not in allow_tags]
            if bad:
                if bad[0] is doc:
                    # The root cannot be dropped; rewrite it instead.
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    el.set('rel', 'nofollow')

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.
        """
        return False

    def allow_element(self, el):
        """
        Return true if ``el``, although its tag is up for removal, may
        stay because every link attribute listed for its tag in
        ``_tag_link_attrs`` is present and passes ``allow_embedded_url``.
        """
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        """
        Return True only if ``el.tag`` is in ``whitelist_tags`` (or that
        is None), the URL scheme is http or https, and the URL's host is
        in ``host_whitelist``.
        """
        if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
            return False
        try:
            parts = urlsplit(url)
        except ValueError:
            # urlsplit rejects some malformed URLs (e.g. an unbalanced
            # IPv6 bracket); an unparseable URL is never whitelisted.
            return False
        if parts.scheme not in ('http', 'https'):
            return False
        # ``hostname`` drops any ``user:password@`` prefix and the port.
        # Splitting the raw netloc at the first ':' would instead let
        # ``http://allowed.host:x@evil.example/`` pass as ``allowed.host``.
        if parts.hostname in self.host_whitelist:
            return True
        return False

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        """
        Drop the subtree of every node yielded by ``doc.iter(iterate)``
        for which ``condition(node)`` is true.
        """
        # Collect first: dropping nodes while iterating the tree is unsafe.
        bad = [el for el in doc.iter(iterate) if condition(el)]
        for el in bad:
            el.drop_tree()

    # NOTE(review): ``__call__`` passes ``self._remove_javascript_link`` to
    # ``doc.rewrite_links``, but that method's definition is not visible in
    # this listing (the listing's own line numbering skips from 449 to 457
    # at this point).  It must be defined for the ``javascript`` option to
    # work -- restore it from the original module.

    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempt to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.
        """
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        """
        Clean ``html`` and return the result in the same form it came in.

        A string is parsed with ``fromstring``; anything else is
        deep-copied first, so the caller's tree is not modified.
        """
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)
# Module-level default cleaner and its string/element entry point.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    # (the host class includes '-', matching the http pattern above; without
    # it a host such as ``my-host.com`` was cut off at the hyphen)
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): the listing's line numbering jumps 512 -> 557 -> 615 -> 624
# here, i.e. three definitions are collapsed and not visible.  The statement
# below shows that ``autolink`` and ``autolink_html`` are among them; restore
# them from the original module.

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<textarea>``, ``<pre>`` and ``<code>``, nor any element carrying
    one of the classes in avoid_classes (by default ``nobreak``).

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    The element is modified in place; nothing is returned.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's avoid_elements rather than the module default,
    # so a custom list is not silently ignored.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        if any(avoid in classes for avoid in avoid_classes):
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail belongs to this element's text flow, so it is broken
        # even when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
674
def word_break_html(html, *args, **kw):
    """
    Apply ``word_break`` to ``html`` and return the result converted by
    ``_transform_result`` according to the type that was passed in.

    A string is parsed with ``fromstring``.  Anything else is treated as
    an already-parsed element and deep-copied first, so the caller's
    tree is left untouched (the same convention as ``Cleaner.clean_html``).
    Extra positional and keyword arguments are passed to ``word_break``.
    """
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)
680
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with ``break_character`` inserted (via
    ``_insert_break``) into every whitespace-delimited word longer than
    ``max_width``.  All whitespace is kept exactly as it was.
    """
    pieces = []
    pos = 0
    for word in text.split():
        # Only whitespace lies between ``pos`` and the next word, so the
        # first hit from ``pos`` is that word itself.  Rewriting each word
        # where it stands (instead of a global ``text.replace``) keeps a
        # long word that also occurs inside another word from corrupting
        # that other word.
        start = text.index(word, pos)
        pieces.append(text[pos:start])
        if len(word) > max_width:
            pieces.append(_insert_break(word, max_width, break_character))
        else:
            pieces.append(word)
        pos = start + len(word)
    pieces.append(text[pos:])
    return ''.join(pieces)
# Any non-letter is a preferred place to break a long word.
_break_prefer_re = re.compile(r'[^a-z]', re.I)

def _insert_break(word, width, break_character):
    """
    Return ``word`` cut into chunks of at most ``width`` characters,
    joined by ``break_character``.

    A chunk ends just after its last non-letter when that position is
    within the final 10 characters of the chunk; otherwise it is cut at
    exactly ``width``.  A ``width`` below 1 returns ``word`` unchanged.
    """
    if width < 1:
        # No chunk size makes sense here, and the loop below could
        # never make progress.
        return word
    chunks = []
    while len(word) > width:
        chunk = word[:width]
        last_break = None
        for last_break in _break_prefer_re.finditer(chunk):
            pass
        # Only walk back up to 10 characters to find a nice break:
        if last_break is not None and last_break.end() > width - 10:
            # FIXME: should the break character be at the end of the
            # chunk, or the beginning of the next chunk?
            chunk = word[:last_break.end()]
        chunks.append(chunk)
        chunks.append(break_character)
        word = word[len(chunk):]
    chunks.append(word)
    return ''.join(chunks)
708