Package lxml :: Package html :: Module clean
[frames | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  import urlparse 
 10  from lxml import etree 
 11  from lxml.html import defs 
 12  from lxml.html import fromstring, tostring 
 13   
 14  try: 
 15      set 
 16  except NameError: 
 17      from sets import Set as set 
 18   
 19  __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 
 20             'word_break', 'word_break_html'] 
 21   
 22  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 23  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 24  # I have multiple kinds of schemes searched; but should schemes be 
 25  #   whitelisted instead? 
 26  # max height? 
 27  # remove images?  Also in CSS?  background attribute? 
 28  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 29  #   allow *just* embedded YouTube movies) 
 30  # Log what was deleted and why? 
 31  # style="behavior: ..." might be bad in IE? 
 32  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 33  #   metas. 
 34  # UTF-7 detections?  Example: 
 35  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 36  #   you don't always have to have the charset set, if the page has no charset 
 37  #   and there's UTF7-like code in it. 
 38  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 39   
 40   
 41  # This is an IE-specific construct you can have in a stylesheet to 
 42  # run some Javascript: 
 43  _css_javascript_re = re.compile( 
 44      r'expression\s*\(.*?\)', re.S|re.I) 
 45   
 46  # Do I have to worry about @\nimport? 
 47  _css_import_re = re.compile( 
 48      r'@\s*import', re.I) 
 49   
 50  # All kinds of schemes besides just javascript: that can cause 
 51  # execution: 
 52  _javascript_scheme_re = re.compile( 
 53      r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I) 
 54  _substitute_whitespace = re.compile(r'\s+').sub 
 55  # FIXME: should data: be blocked? 
 56   
 57  # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx 
 58  _conditional_comment_re = re.compile( 
 59      r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) 
 60   
 61  _find_styled_elements = etree.XPath( 
 62      "descendant-or-self::*[@style]") 
 63   
 64  _find_external_links = etree.XPath( 
 65      "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']") 
 66   
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and
        ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from `feedparser
        <http://feedparser.org/docs/html-sanitization.html>`_).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

    def __init__(self, **kw):
        """
        Override any of the class-level options by keyword.

        Raises TypeError for a keyword that is not already an attribute
        of the instance, so misspelled option names are not silently
        accepted.
        """
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

    def __call__(self, doc):
        """
        Cleans the document.

        ``doc`` is an element, or an ElementTree-like object exposing
        ``getroot()``; it is modified in place and nothing is returned.

        Raises ValueError when both ``allow_tags`` and
        ``remove_unknown_tags`` are set.
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)
        # kill_tags: element is dropped together with its content;
        # remove_tags: only the tag is dropped (drop_tag), content stays.
        kill_tags = set()
        remove_tags = set(self.remove_tags or ())
        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.iter():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs handles events attributes itself
                for el in doc.iter():
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If we're deleting style then we don't have to remove JS links
                # from styles, otherwise...
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    # Chain on ``new``: substituting on ``old`` again would
                    # throw away the expression(...) removal just made.
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    # (chained on ``new`` for the same reason as above)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            for el in _find_styled_elements(doc):
                del el.attrib['style']
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        # Collect first, modify afterwards, so the tree is not changed
        # while it is being iterated.
        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            for el in bad:
                el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    el.set('rel', 'nofollow')

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.
        """
        return False

    def allow_element(self, el):
        """
        Return true if ``el``, although scheduled for removal, may stay.

        The tag must be listed in ``_tag_link_attrs``, and every link
        attribute listed there for the tag must be present, non-empty
        and accepted by ``allow_embedded_url``.
        """
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        """
        Return True if ``url`` may be embedded through element ``el``.

        Requires the tag to be in ``whitelist_tags`` (unless that is
        None), the scheme to be http or https, and the lower-cased host
        name to be in ``host_whitelist``.
        """
        if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
            return False
        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
        if scheme not in ('http', 'https'):
            return False
        # The netloc may carry ``user:password@`` before the host.  Strip
        # it first; otherwise ``http://allowed.host:x@evil.host/`` would be
        # judged by the text before the first colon ("allowed.host").
        host = netloc.split('@')[-1]
        host = host.lower().split(':', 1)[0]
        if host in self.host_whitelist:
            return True
        return False

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        """
        Drop (with ``drop_tree``) every node from ``doc.iter(iterate)``
        for which ``condition(el)`` is true.
        """
        # Collect before dropping so the tree is not modified mid-iteration.
        bad = []
        for el in doc.iter(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    # NOTE(review): ``_remove_javascript_link``, which ``__call__`` passes to
    # ``rewrite_links``, is not visible in this listing (source lines 415-421
    # are collapsed), so it is not reproduced here.

    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempt to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.
        """
        # Normalize away everything that could be used to split the
        # keywords: CSS comments, backslashes, whitespace and case.
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        """
        Clean ``html`` and return the result, leaving the input untouched.

        A string is parsed, cleaned and serialized back to a string; any
        other object is deep-copied and the cleaned copy is returned.
        """
        if isinstance(html, basestring):
            return_string = True
            doc = fromstring(html)
        else:
            return_string = False
            doc = copy.deepcopy(html)
        self(doc)
        if return_string:
            return tostring(doc)
        else:
            return doc
# Ready-made cleaner using the default options, plus its bound
# ``clean_html`` method exposed as a module-level function.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Each pattern exposes the named groups ``body`` and ``host``.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): this listing skips source lines 482-597 here; ``autolink``
# and ``autolink_html``, referenced on the next statement, are presumably
# defined in that gap -- confirm against the full module.

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']

def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=u'\u200b'):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<textarea>``, ``<pre>`` and ``<code>``, nor any element whose
    ``class`` attribute contains one of avoid_classes (by default
    ``nobreak``).  Skipping an element skips its whole subtree.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    ``el`` is modified in place; nothing is returned.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's avoid_elements rather than the module default.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        for avoid in avoid_classes:
            if avoid in classes:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail belongs to this element's content, so it is broken even
        # when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
647
def word_break_html(html, *args, **kw):
    """
    String-in, string-out wrapper around ``word_break``.

    Parses ``html`` with ``fromstring``, applies ``word_break`` to the
    result (forwarding any extra positional and keyword arguments) and
    returns the ``tostring`` serialization.
    """
    root = fromstring(html)
    word_break(root, *args, **kw)
    serialized = tostring(root)
    return serialized
652
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with every whitespace-delimited word longer than
    ``max_width`` passed through ``_insert_break``.

    All whitespace is kept exactly as it was.  Each word is rewritten at
    its own position, so a long word that also appears inside a different
    word does not cause that other word to be modified as a side effect.
    """
    def _break_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word
    # re.U so that, on Python 2 unicode text, ``\S`` agrees with the
    # whitespace that ``unicode.split()`` would split on.
    return re.compile(r'\S+', re.U).sub(_break_word, text)
# Any non-letter character is a preferred place to break a word after.
_break_prefer_re = re.compile(r'[^a-z]', re.I)


def _insert_break(word, width, break_character):
    """
    Return ``word`` split into chunks of at most ``width`` characters,
    with ``break_character`` appended after every chunk but the last.

    A chunk is cut short so that it ends just after its last non-letter
    character when that position lies within the final 10 characters of
    the chunk.  A ``width`` of zero or less cannot be honoured; the word
    is then returned unchanged.
    """
    if width <= 0:
        # A non-positive width would never shorten ``word`` below and
        # the loop would not terminate.
        return word
    pieces = []
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nice break:
            if last_break.end() > width-10:
                # FIXME: should the break character be at the end of the
                # chunk, or the beginning of the next chunk?
                start = word[:last_break.end()]
        pieces.append(start + break_character)
        word = word[len(start):]
    pieces.append(word)
    return ''.join(pieces)
680