Package lxml :: Package html :: Module clean
[hide private]
[frames | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11  except ImportError: 
 12      # Python 3 
 13      from urllib.parse import urlsplit 
 14  from lxml import etree 
 15  from lxml.html import defs 
 16  from lxml.html import fromstring, tostring, XHTML_NAMESPACE 
 17  from lxml.html import _nons, _transform_result 
 18   
 19  try: 
 20      set 
 21  except NameError: 
 22      # Python 3 
 23      from sets import Set as set 
 24   
 25  try: 
 26      unichr = __builtins__['unichr'] 
 27  except (NameError, KeyError): 
 28      # Python 3 
 29      unichr = chr 
 30   
 31  try: 
 32      unicode = __builtins__['unicode'] 
 33  except (NameError, KeyError): 
 34      # Python 3 
 35      unicode = str 
 36   
 37  try: 
 38      bytes = __builtins__['bytes'] 
 39  except (NameError, KeyError): 
 40      # Python < 2.6 
 41      bytes = str 
 42   
 43  try: 
 44      basestring = __builtins__['basestring'] 
 45  except (NameError, KeyError): 
 46      basestring = (str, bytes) 
 47   
 48   
# Names exported by ``from lxml.html.clean import *``.
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']
 51   
 52  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 53  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 54  # I have multiple kinds of schemes searched; but should schemes be 
 55  #   whitelisted instead? 
 56  # max height? 
 57  # remove images?  Also in CSS?  background attribute? 
 58  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 59  #   allow *just* embedded YouTube movies) 
 60  # Log what was deleted and why? 
 61  # style="behavior: ..." might be bad in IE? 
 62  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 63  #   metas. 
 64  # UTF-7 detections?  Example: 
 65  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 66  #   you don't always have to have the charset set, if the page has no charset 
 67  #   and there's UTF7-like code in it. 
 68  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 69   
 70   
 71  # This is an IE-specific construct you can have in a stylesheet to 
 72  # run some Javascript: 
 73  _css_javascript_re = re.compile( 
 74      r'expression\s*\(.*?\)', re.S|re.I) 
 75   
 76  # Do I have to worry about @\nimport? 
 77  _css_import_re = re.compile( 
 78      r'@\s*import', re.I) 
 79   
 80  # All kinds of schemes besides just javascript: that can cause 
 81  # execution: 
 82  _javascript_scheme_re = re.compile( 
 83      r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I) 
 84  _substitute_whitespace = re.compile(r'\s+').sub 
 85  # FIXME: should data: be blocked? 
 86   
 87  # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx 
 88  _conditional_comment_re = re.compile( 
 89      r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) 
 90   
# Selects the context node and every descendant element that carries a
# ``style`` attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Selects <a> elements (plain, or in the XHTML namespace via the ``x``
# prefix) whose whitespace-normalized href is non-empty and does not begin
# with '#', i.e. links that are not pure in-page fragment references.
_find_external_links = etree.XPath(
    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
 98   
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from `feedparser
        <http://feedparser.org/docs/html-sanitization.html>`_).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

    def __init__(self, **kw):
        """
        Override any of the class-level options by keyword.

        Raises TypeError for a keyword that is not an existing attribute,
        so that misspelled option names are not silently ignored.
        """
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

    def __call__(self, doc):
        """
        Cleans the document.

        ``doc`` is an element, or an ElementTree (its root is used).  The
        tree is modified in place and nothing is returned.

        Raises ValueError, before anything is modified, if both
        ``allow_tags`` and ``remove_unknown_tags`` are set.
        """
        # Reject the contradictory configuration up front so that a failing
        # call does not leave the caller with a half-cleaned document.
        if self.remove_unknown_tags and self.allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        if hasattr(doc, 'getroot'):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        for el in doc.iter():
            tag = el.tag
            if isinstance(tag, basestring):
                el.tag = _nons(tag)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)
        # kill_tags: element is dropped together with its content.
        # remove_tags: only the tag is dropped, children and text are kept.
        kill_tags = set()
        remove_tags = set(self.remove_tags or ())
        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.iter():
                attrib = el.attrib
                # Snapshot the names: attributes are deleted while looping.
                for aname in list(attrib.keys()):
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs handles events attributes itself
                for el in doc.iter():
                    attrib = el.attrib
                    for aname in list(attrib.keys()):
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If we're deleting style then we don't have to remove JS links
                # from styles, otherwise...
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    # Chain on ``new`` (not ``old``) so the expression()
                    # removal above is not thrown away.
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            for el in _find_styled_elements(doc):
                del el.attrib['style']
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        # Collect first, drop afterwards, so the tree is not modified
        # while it is being iterated.
        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            allow_tags = set(defs.tags)
        elif self.allow_tags:
            allow_tags = set(self.allow_tags)
        else:
            allow_tags = None
        if allow_tags:
            bad = [el for el in doc.iter() if el.tag not in allow_tags]
            if bad and bad[0] == doc:
                # As above: the parent-most tag can't be dropped, so
                # rewrite it instead.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    el.set('rel', 'nofollow')

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.
        """
        return False

    def allow_element(self, el):
        """
        Return true if ``el``, although up for removal, may be kept.

        The element is kept only when its tag has an entry in
        ``_tag_link_attrs`` and every link attribute listed there is
        present and accepted by ``allow_embedded_url``.
        """
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            # Several attributes can carry a link: all of them must pass.
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        """
        Return true if ``url`` may be embedded through element ``el``.

        Requires the tag to be in ``whitelist_tags`` (unless that is
        None), the scheme to be http or https, and the lowercased host
        (port stripped) to be in ``host_whitelist``.
        """
        if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
            return False
        scheme, netloc, path, query, fragment = urlsplit(url)
        netloc = netloc.lower().split(':', 1)[0]
        if scheme not in ('http', 'https'):
            return False
        if netloc in self.host_whitelist:
            return True
        return False

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        """
        Drop (with content) every node for which ``condition(el)`` is true.

        ``iterate`` is passed to ``doc.iter()`` to restrict which nodes
        are visited.
        """
        # Collect first so the tree is not modified during iteration.
        bad = []
        for el in doc.iter(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    # NOTE(review): ``_remove_javascript_link``, which ``__call__`` passes to
    # ``doc.rewrite_links``, sits at this point in the original module
    # (listing lines 452-458) but is collapsed in the reviewed listing, so
    # it is not reproduced here -- restore it from the upstream source.

    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempt to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.
        """
        # Strip everything that can be used to disguise the keywords:
        # CSS comments, backslash escapes, whitespace and letter case.
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        """
        Clean ``html`` and return the result in the same form it came in.

        A string is parsed with ``fromstring``; anything else is
        deep-copied, so the caller's object is left untouched.
        """
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)
# Default cleaner instance and its bound ``clean_html`` method, exported
# through ``__all__`` as module-level conveniences.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Patterns for text to be linked; each exposes ``body`` and ``host`` groups.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): the reviewed listing collapses original lines 515-625 at this
# point; ``autolink`` and ``autolink_html`` (used on the next statement) are
# presumably defined in that span -- restore them from the upstream source.

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

# Defaults for ``word_break``: tags and CSS classes whose content is left
# unbroken.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']

def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in ``avoid_elements`` (by default
    ``<pre>``, ``<textarea>`` and ``<code>``), nor any element carrying
    one of the classes in ``avoid_classes`` (by default ``nobreak``).
    Skipped elements are skipped together with their descendants.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    The element is modified in place; nothing is returned.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's ``avoid_elements`` (the module default was
    # previously consulted here regardless of the argument).
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_names = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_names:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail belongs to this element's content, so it is broken
        # even when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)

676
def word_break_html(html, *args, **kw):
    """
    Parse ``html``, apply ``word_break`` to the resulting document and
    return it converted back to the type of the input.

    Extra positional and keyword arguments are forwarded to ``word_break``.
    """
    document = fromstring(html)
    word_break(document, *args, **kw)
    return _transform_result(type(html), document)

682
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with ``break_character`` inserted into every
    whitespace-delimited word longer than ``max_width``.

    All original whitespace is preserved.  Each word is rewritten at its
    own position, so a long word that also occurs inside a longer token
    does not cause that token to be altered a second time or left
    with an over-long run.
    """
    # Splitting on a capturing group keeps the separators: even indexes
    # hold the words, odd indexes hold the whitespace runs between them.
    pieces = re.compile(r'(\s+)', re.U).split(text)
    for index in range(0, len(pieces), 2):
        word = pieces[index]
        if len(word) > max_width:
            pieces[index] = _insert_break(word, max_width, break_character)
    return ''.join(pieces)

# Any non-letter is a preferred place to break a long word.
_break_prefer_re = re.compile(r'[^a-z]', re.I)


def _insert_break(word, width, break_character):
    """
    Return ``word`` cut into chunks of at most ``width`` characters,
    each followed by ``break_character`` (the final remainder is not).

    A chunk ends early, just after its last non-letter character, when
    that character lies within the last 10 characters of the chunk.

    Raises ValueError if ``width`` is smaller than 1, since no progress
    could be made and the loop would never terminate.
    """
    if width < 1:
        raise ValueError("width must be at least 1, got %r" % (width,))
    chunks = []
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nice break:
            if last_break.end() > width-10:
                # FIXME: should the break character be at the end of the
                # chunk, or the beginning of the next chunk?
                start = word[:last_break.end()]
        chunks.append(start)
        chunks.append(break_character)
        word = word[len(start):]
    chunks.append(word)
    return ''.join(chunks)

710