Package lxml :: Package html :: Module clean
[hide private]
[frames | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11  except ImportError: 
 12      # Python 3 
 13      from urllib.parse import urlsplit 
 14  from lxml import etree 
 15  from lxml.html import defs 
 16  from lxml.html import fromstring, tostring, XHTML_NAMESPACE 
 17  from lxml.html import xhtml_to_html, _transform_result 
 18   
 19  try: 
 20      unichr 
 21  except NameError: 
 22      # Python 3 
 23      unichr = chr 
 24  try: 
 25      unicode 
 26  except NameError: 
 27      # Python 3 
 28      unicode = str 
 29  try: 
 30      bytes 
 31  except NameError: 
 32      # Python < 2.6 
 33      bytes = str 
 34  try: 
 35      basestring 
 36  except NameError: 
 37      basestring = (str, bytes) 
 38   
 39   
 40  __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 
 41             'word_break', 'word_break_html'] 
 42   
 43  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 44  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 45  # I have multiple kinds of schemes searched; but should schemes be 
 46  #   whitelisted instead? 
 47  # max height? 
 48  # remove images?  Also in CSS?  background attribute? 
 49  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 50  #   allow *just* embedded YouTube movies) 
 51  # Log what was deleted and why? 
 52  # style="behavior: ..." might be bad in IE? 
 53  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 54  #   metas. 
 55  # UTF-7 detections?  Example: 
 56  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 57  #   you don't always have to have the charset set, if the page has no charset 
 58  #   and there's UTF7-like code in it. 
 59  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 60   
 61   
 62  # This is an IE-specific construct you can have in a stylesheet to 
 63  # run some Javascript: 
     # (matches ``expression``, optional whitespace and a non-greedy
     # parenthesised body; case-insensitive, and ``.`` also matches newlines)
 64  _css_javascript_re = re.compile( 
 65      r'expression\s*\(.*?\)', re.S|re.I) 
 66   
 67  # Do I have to worry about @\nimport? 
     # (matches ``@import``, allowing whitespace between ``@`` and ``import``)
 68  _css_import_re = re.compile( 
 69      r'@\s*import', re.I) 
 70   
 71  # All kinds of schemes besides just javascript: that can cause 
 72  # execution: 
     # (bound ``search``: finds any of these schemes anywhere in the string,
     # case-insensitively)
 73  _is_javascript_scheme = re.compile( 
 74      r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):', 
 75      re.I).search 
     # Bound ``sub`` over runs of whitespace and the control characters
     # \x00-\x08, \x0B, \x0C and \x0E-\x19; called as
     # ``_substitute_whitespace('', text)`` to strip them before keyword checks.
 76  _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub 
 77  # FIXME: should data: be blocked? 
 78   
 79  # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx 
     # Matches the ``[if ...]>`` opener of an IE conditional comment: ``[if``,
     # whitespace, anything up to the first ``]``, optional whitespace, ``>``.
 80  _conditional_comment_re = re.compile( 
 81      r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) 
 82   
     # Selects the context element and every descendant that has a ``style``
     # attribute.
 83  _find_styled_elements = etree.XPath( 
 84      "descendant-or-self::*[@style]") 
 85   
     # Selects ``<a>`` elements (un-namespaced or in the XHTML namespace) whose
     # ``href`` is non-empty after whitespace normalisation and does not start
     # with ``#``, i.e. is not a same-page fragment link.
 86  _find_external_links = etree.XPath( 
 87      ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" 
 88       "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), 
 89      namespaces={'x':XHTML_NAMESPACE}) 
 90   
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes
        stylesheets as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

    def __init__(self, **kw):
        """
        Override any of the class-level options for this instance.

        Raises ``TypeError`` for a keyword that does not name an existing
        attribute, so that misspelled options are not silently ignored.
        """
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

    def __call__(self, doc):
        """
        Cleans the document in place according to the instance options.

        ``doc`` may be an element or an ElementTree (anything exposing
        ``getroot()``); in the latter case its root element is cleaned.

        Raises ``ValueError`` if both ``allow_tags`` and
        ``remove_unknown_tags`` are set.
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        # Work on private copies so the configured options are never mutated.
        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only and
                    self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter(etree.Element):
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If we're deleting style then we don't have to remove JS links
                # from styles, otherwise...
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow
                    # it.  Chain on ``new`` (not ``old``) so the
                    # expression(...) removal above is not thrown away.
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        # Collect first, modify afterwards: the tree must not change while
        # it is being iterated.
        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] is doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] is doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            # Comments and processing instructions have a non-string ``tag``
            # that is never part of a tag-name whitelist.  If the caller
            # asked to keep them, do not drop them here as "unknown tags".
            if not self.comments:
                allow_tags.add(etree.Comment)
            if not self.processing_instructions:
                allow_tags.add(etree.ProcessingInstruction)
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    # The root cannot be dropped; neutralise it instead.
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get('rel')
                    if rel:
                        # Cheap substring test first, then an exact
                        # whitespace-delimited token test.
                        if ('nofollow' in rel
                                and ' nofollow ' in (' %s ' % rel)):
                            continue
                        rel = '%s nofollow' % rel
                    else:
                        rel = 'nofollow'
                    el.set('rel', rel)

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.

        Returns false by default, i.e. every external anchor is marked.
        """
        return False

    def allow_element(self, el):
        """
        Decide whether an element that is up for removal may stay.

        The element's link attribute(s) are looked up in
        ``_tag_link_attrs``; the element is kept only if every such
        attribute is present, non-empty and accepted by
        ``allow_embedded_url``.  Tags without an entry are never kept.
        """
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        """
        Return true if ``url`` may be embedded through element ``el``.

        The tag must be in ``whitelist_tags`` (unless that is None), the
        scheme must be ``http`` or ``https``, and the host (lower-cased,
        without port) must be in ``host_whitelist``.
        """
        if (self.whitelist_tags is not None
                and el.tag not in self.whitelist_tags):
            return False
        scheme, netloc, path, query, fragment = urlsplit(url)
        netloc = netloc.lower().split(':', 1)[0]
        if scheme not in ('http', 'https'):
            return False
        if netloc in self.host_whitelist:
            return True
        return False

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        """
        Drop (with their subtrees) all nodes for which ``condition(el)``
        is true.  ``iterate`` is passed to ``doc.iter()`` to restrict the
        kind of nodes visited.
        """
        # Materialise the matches first so the tree is not modified while
        # it is being iterated.
        bad = [el for el in doc.iter(iterate) if condition(el)]
        for el in bad:
            el.drop_tree()

    # NOTE(review): ``_remove_javascript_link`` -- the callback handed to
    # ``rewrite_links`` in ``__call__`` -- is not visible in the listing this
    # block was restored from (original source lines 467-473 are collapsed).
    # Its definition belongs here and must be carried over from the module.

    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempt to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.
        """
        # Undo the usual obfuscations (CSS comments, backslash escapes,
        # embedded whitespace/control characters, mixed case) before looking
        # for the dangerous keywords.
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        """
        Clean ``html`` and return the result in the same form it came in.

        Strings are parsed with ``fromstring``; anything else is treated
        as a document/element and deep-copied, so the caller's object is
        left untouched.
        """
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)
# Module-level cleaner using the default options, and its ``clean_html``
# bound method exposed as a plain function.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Each pattern exposes a ``body`` group and a ``host`` group.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

# NOTE(review): the three ``_avoid_*`` values below are presumably the
# defaults of the autolink functions, whose definitions are collapsed in
# this listing -- confirm against the full module.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): original source lines 530-573, 575-631 and 633-640 are
# collapsed in this listing and are not reproduced here.  ``autolink`` and
# ``autolink_html``, referenced on the next statement, must come from there.

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

# Defaults for ``word_break()``: tags and CSS classes whose text is left alone.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in ``avoid_elements`` (by default
    ``<pre>``, ``<textarea>`` and ``<code>``), nor elements carrying one
    of the classes in ``avoid_classes``.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    The element is modified in place; nothing is returned.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller-supplied ``avoid_elements`` rather than the module
    # default, so a custom list is respected at every recursion level.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        if any(avoid in classes for avoid in avoid_classes):
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail belongs to this element's text flow, so it is broken
        # even when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
691
def word_break_html(html, *args, **kw):
    """
    Parse ``html``, apply ``word_break()`` to the resulting tree (extra
    arguments are passed through unchanged) and return the outcome
    converted back to the type of the input.
    """
    original_type = type(html)
    tree = fromstring(html)
    word_break(tree, *args, **kw)
    return _transform_result(original_type, tree)
697
698 -def _break_text(text, max_width, break_character):
699 words = text.split() 700 for word in words: 701 if len(word) > max_width: 702 replacement = _insert_break(word, max_width, break_character) 703 text = text.replace(word, replacement) 704 return text
705 706 _break_prefer_re = re.compile(r'[^a-z]', re.I) 707
708 -def _insert_break(word, width, break_character):
709 orig_word = word 710 result = '' 711 while len(word) > width: 712 start = word[:width] 713 breaks = list(_break_prefer_re.finditer(start)) 714 if breaks: 715 last_break = breaks[-1] 716 # Only walk back up to 10 characters to find a nice break: 717 if last_break.end() > width-10: 718 # FIXME: should the break character be at the end of the 719 # chunk, or the beginning of the next chunk? 720 start = word[:last_break.end()] 721 result += start + break_character 722 word = word[len(start):] 723 result += word 724 return result
725