Package lxml :: Package html :: Module clean
[hide private]
[frames | no frames]

Source Code for Module lxml.html.clean

  1  import re 
  2  import urlparse 
  3  from lxml import etree 
  4  from lxml.html import defs 
  5  from lxml.html import fromstring, tostring 
  6   
  7  try: 
  8      set 
  9  except NameError: 
 10      from sets import Set as set 
 11   
 12  __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', 
 13             'word_break', 'word_break_html'] 
 14   
 15  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 16  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 17  # I have multiple kinds of schemes searched; but should schemes be 
 18  #   whitelisted instead? 
 19  # max height? 
 20  # remove images?  Also in CSS?  background attribute? 
 21  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 22  #   allow *just* embedded YouTube movies) 
 23  # Log what was deleted and why? 
 24  # style="behavior: ..." might be bad in IE? 
 25  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 26  #   metas. 
 27  # UTF-7 detections?  Example: 
 28  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 29  #   you don't always have to have the charset set, if the page has no charset 
 30  #   and there's UTF7-like code in it. 
 31  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 32   
 33   
 34  # This is an IE-specific construct you can have in a stylesheet to 
 35  # run some Javascript: 
 36  _css_javascript_re = re.compile( 
 37      r'expression\s*\(.*?\)', re.S|re.I) 
 38   
 39  # Do I have to worry about @\nimport? 
 40  _css_import_re = re.compile( 
 41      r'@\s*import', re.I) 
 42   
 43  # All kinds of schemes besides just javascript: that can cause 
 44  # execution: 
 45  _javascript_scheme_re = re.compile( 
 46      r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I) 
 47  _whitespace_re = re.compile(r'\s+') 
 48  # FIXME: should data: be blocked? 
 49   
 50  # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx 
 51  _conditional_comment_re = re.compile( 
 52      r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) 
 53   
 54  _find_styled_elements = etree.XPath( 
 55      "descendant-or-self::*[@style]") 
 56   
 57  _find_external_links = etree.XPath( 
 58      "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']") 
 59   
def clean_html(html, **kw):
    """
    Like clean(), but takes a text input document, and returns a text
    document.

    Any keyword arguments are used as ``Cleaner`` options (see the
    ``Cleaner`` class attributes); an unknown option raises ``TypeError``.

    Note that this name is rebound further down in the module to
    ``clean.clean_html``, the bound method of the default cleaner.
    """
    doc = fromstring(html)
    # Cleaner.__call__ takes no options itself, so the options have to go
    # to the constructor rather than to the shared ``clean`` instance.
    Cleaner(**kw)(doc)
    return tostring(doc)
68
69 -class Cleaner(object):
70 """ 71 Instances cleans the document of each of the possible offending 72 elements. The cleaning is controlled by attributes; you can 73 override attributes in a subclass, or set them in the constructor. 74 75 ``scripts``: 76 Removes any ``<script>`` tags. 77 78 ``javascript``: 79 Removes any Javascript, like an ``onclick`` attribute. 80 81 ``comments``: 82 Removes any comments. 83 84 ``style``: 85 Removes any style tags or attributes. 86 87 ``links``: 88 Removes any ``<link>`` tags 89 90 ``meta``: 91 Removes any ``<meta>`` tags 92 93 ``page_structure``: 94 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. 95 96 ``processing_instructions``: 97 Removes any processing instructions. 98 99 ``embedded``: 100 Removes any embedded objects (flash, iframes) 101 102 ``frames``: 103 Removes any frame-related tags 104 105 ``forms``: 106 Removes any form tags 107 108 ``annoying_tags``: 109 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marque>`` 110 111 ``remove_tags``: 112 A list of tags to remove. 113 114 ``allow_tags``: 115 A list of tags to include (default include all). 116 117 ``remove_unknown_tags``: 118 Remove any tags that aren't standard parts of HTML. 119 120 ``safe_attrs_only``: 121 If true, only include 'safe' attributes (specifically the list 122 from `feedparser 123 <http://feedparser.org/docs/html-sanitization.html>`_). 124 125 ``add_nofollow``: 126 If true, then any <a> tags will have ``rel="nofollow"`` added to them. 127 128 ``host_whitelist``: 129 A list or set of hosts that you can use for embedded content 130 (for content like ``<object>``, ``<link rel="stylesheet">``, etc). 131 You can also implement/override the method 132 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to 133 implement more complex rules for what can be embedded. 134 Anything that passes this test will be shown, regardless of 135 the value of (for instance) ``embedded``. 
136 137 Note that this parameter might not work as intended if you do not 138 make the links absolute before doing the cleaning. 139 140 ``whitelist_tags``: 141 A set of tags that can be included with ``host_whitelist``. 142 The default is ``iframe`` and ``embed``; you may wish to 143 include other tags like ``script``, or you may want to 144 implement ``allow_embedded_url`` for more control. Set to None to 145 include all tags. 146 147 This modifies the document *in place*. 148 """ 149 150 scripts = True 151 javascript = True 152 comments = True 153 style = False 154 links = True 155 meta = True 156 page_structure = True 157 processing_instructions = True 158 embedded = True 159 frames = True 160 forms = True 161 annoying_tags = True 162 remove_tags = None 163 allow_tags = None 164 remove_unknown_tags = True 165 safe_attrs_only = True 166 add_nofollow = False 167 host_whitelist = () 168 whitelist_tags = set(['iframe', 'embed']) 169
170 - def __init__(self, **kw):
171 for name, value in kw.items(): 172 if not hasattr(self, name): 173 raise TypeError( 174 "Unknown parameter: %s=%r" % (name, value)) 175 setattr(self, name, value)
176 177 # Used to lookup the primary URL for a given tag that is up for 178 # removal: 179 _tag_link_attrs = dict( 180 script='src', 181 link='href', 182 # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html 183 # From what I can tell, both attributes can contain a link: 184 applet=['code', 'object'], 185 iframe='src', 186 embed='src', 187 layer='src', 188 # FIXME: there doesn't really seem like a general way to figure out what 189 # links an <object> tag uses; links often go in <param> tags with values 190 # that we don't really know. You'd have to have knowledge about specific 191 # kinds of plugins (probably keyed off classid), and match against those. 192 ##object=?, 193 # FIXME: not looking at the action currently, because it is more complex 194 # than than -- if you keep the form, you should keep the form controls. 195 ##form='action', 196 a='href', 197 ) 198
199 - def __call__(self, doc):
200 """ 201 Cleans the document. 202 """ 203 if hasattr(doc, 'getroot'): 204 # ElementTree instance, instead of an element 205 doc = doc.getroot() 206 # Normalize a case that IE treats <image> like <img>, and that 207 # can confuse either this step or later steps. 208 for el in doc.getiterator('image'): 209 el.tag = 'img' 210 if not self.comments: 211 # Of course, if we were going to kill comments anyway, we don't 212 # need to worry about this 213 self.kill_conditional_comments(doc) 214 kill_tags = set() 215 remove_tags = set(self.remove_tags or ()) 216 if self.allow_tags: 217 allow_tags = set(self.allow_tags) 218 else: 219 allow_tags = set() 220 if self.scripts: 221 kill_tags.add('script') 222 if self.safe_attrs_only: 223 safe_attrs = set(defs.safe_attrs) 224 for el in doc.getiterator(): 225 attrib = el.attrib 226 for aname in attrib.keys(): 227 if aname not in safe_attrs: 228 del attrib[aname] 229 if self.javascript: 230 if not self.safe_attrs_only: 231 # safe_attrs handles events attributes itself 232 for el in doc.getiterator(): 233 attrib = el.attrib 234 for aname in attrib.keys(): 235 if aname.startswith('on'): 236 del attrib[aname] 237 doc.rewrite_links(self._remove_javascript_link, 238 resolve_base_href=False) 239 if not self.style: 240 # If we're deleting style then we don't have to remove JS links 241 # from styles, otherwise... 242 for el in _find_styled_elements(doc): 243 old = el.get('style') 244 new = _css_javascript_re.sub('', old) 245 new = _css_import_re.sub('', old) 246 if self._has_sneaky_javascript(new): 247 # Something tricky is going on... 
248 del el.attrib['style'] 249 elif new != old: 250 el.set('style', new) 251 for el in list(doc.getiterator('style')): 252 if el.get('type', '').lower().strip() == 'text/javascript': 253 el.drop_tree() 254 continue 255 old = el.text or '' 256 new = _css_javascript_re.sub('', old) 257 # The imported CSS can do anything; we just can't allow: 258 new = _css_import_re.sub('', old) 259 if self._has_sneaky_javascript(new): 260 # Something tricky is going on... 261 el.text = '/* deleted */' 262 elif new != old: 263 el.text = new 264 if self.comments or self.processing_instructions: 265 # FIXME: why either? I feel like there's some obscure reason 266 # because you can put PIs in comments...? But I've already 267 # forgotten it 268 kill_tags.add(etree.Comment) 269 if self.processing_instructions: 270 kill_tags.add(etree.ProcessingInstruction) 271 if self.style: 272 kill_tags.add('style') 273 for el in _find_styled_elements(doc): 274 del el.attrib['style'] 275 if self.links: 276 kill_tags.add('link') 277 elif self.style or self.javascript: 278 # We must get rid of included stylesheets if Javascript is not 279 # allowed, as you can put Javascript in them 280 for el in list(doc.getiterator('link')): 281 if 'stylesheet' in el.get('rel', '').lower(): 282 # Note this kills alternate stylesheets as well 283 el.drop_tree() 284 if self.meta: 285 kill_tags.add('meta') 286 if self.page_structure: 287 remove_tags.update(('head', 'html', 'title')) 288 if self.embedded: 289 # FIXME: is <layer> really embedded? 290 # We should get rid of any <param> tags not inside <applet>; 291 # These are not really valid anyway. 
292 for el in list(doc.getiterator('param')): 293 found_parent = False 294 parent = el.getparent() 295 while parent is not None and parent.tag not in ('applet', 'object'): 296 parent = parent.getparent() 297 if parent is None: 298 el.drop_tree() 299 kill_tags.update(('applet',)) 300 # The alternate contents that are in an iframe are a good fallback: 301 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) 302 if self.frames: 303 # FIXME: ideally we should look at the frame links, but 304 # generally frames don't mix properly with an HTML 305 # fragment anyway. 306 kill_tags.update(defs.frame_tags) 307 if self.forms: 308 remove_tags.add('form') 309 kill_tags.update(('button', 'input', 'select', 'textarea')) 310 if self.annoying_tags: 311 remove_tags.update(('blink', 'marque')) 312 313 _remove = [] 314 _kill = [] 315 for el in doc.getiterator(): 316 if el.tag in kill_tags: 317 if self.allow_element(el): 318 continue 319 _kill.append(el) 320 elif el.tag in remove_tags: 321 if self.allow_element(el): 322 continue 323 _remove.append(el) 324 325 if _remove and _remove[0] == doc: 326 # We have to drop the parent-most tag, which we can't 327 # do. Instead we'll rewrite it: 328 el = _remove.pop(0) 329 el.tag = 'div' 330 el.attrib.clear() 331 elif _kill and _kill[0] == doc: 332 # We have to drop the parent-most element, which we can't 333 # do. 
Instead we'll clear it: 334 el = _kill.pop(0) 335 if el.tag != 'html': 336 el.tag = 'div' 337 el.clear() 338 339 for el in _kill: 340 el.drop_tree() 341 for el in _remove: 342 el.drop_tag() 343 344 allow_tags = self.allow_tags 345 if self.remove_unknown_tags: 346 if allow_tags: 347 raise ValueError( 348 "It does not make sense to pass in both allow_tags and remove_unknown_tags") 349 allow_tags = set(defs.tags) 350 if allow_tags: 351 bad = [] 352 for el in doc.getiterator(): 353 if el.tag not in allow_tags: 354 bad.append(el) 355 for el in bad: 356 el.drop_tag() 357 if self.add_nofollow: 358 for el in _find_external_links(doc): 359 if not self.allow_follow(el): 360 el.set('rel', 'nofollow')
361
362 - def allow_follow(self, anchor):
363 """ 364 Override to suppress rel="nofollow" on some anchors. 365 """ 366 return False
367
368 - def allow_element(self, el):
369 if el.tag not in self._tag_link_attrs: 370 return False 371 attr = self._tag_link_attrs[el.tag] 372 if isinstance(attr, (list, tuple)): 373 for one_attr in attr: 374 url = el.get(one_attr) 375 if not url: 376 return False 377 if not self.allow_embedded_url(el, url): 378 return False 379 return True 380 else: 381 url = el.get(attr) 382 if not url: 383 return False 384 return self.allow_embedded_url(el, url)
385
386 - def allow_embedded_url(self, el, url):
387 if (self.whitelist_tags is not None 388 and el.tag not in self.whitelist_tags): 389 return False 390 scheme, netloc, path, query, fragment = urlparse.urlsplit(url) 391 netloc = netloc.lower().split(':', 1)[0] 392 if scheme not in ('http', 'https'): 393 return False 394 if netloc in self.host_whitelist: 395 return True 396 return False
397
398 - def kill_conditional_comments(self, doc):
399 """ 400 IE conditional comments basically embed HTML that the parser 401 doesn't normally see. We can't allow anything like that, so 402 we'll kill any comments that could be conditional. 403 """ 404 bad = [] 405 self._kill_elements( 406 doc, lambda el: _conditional_comment_re.search(el.text), 407 etree.Comment)
408
409 - def _kill_elements(self, doc, condition, iterate=None):
410 bad = [] 411 for el in doc.getiterator(iterate): 412 if condition(el): 413 bad.append(el) 414 for el in bad: 415 el.drop_tree()
416 424 425 _decomment_re = re.compile(r'/\*.*?\*/', re.S) 426
427 - def _has_sneaky_javascript(self, style):
428 """ 429 Depending on the browser, stuff like ``e x p r e s s i o n(...)`` 430 can get interpreted, or ``expre/* stuff */ssion(...)``. This 431 checks for attempt to do stuff like this. 432 433 Typically the response will be to kill the entire style; if you 434 have just a bit of Javascript in the style another rule will catch 435 that and remove only the Javascript from the style; this catches 436 more sneaky attempts. 437 """ 438 style = self._decomment_re.sub('', style) 439 style = style.replace('\\', '') 440 style = _whitespace_re.sub('', style) 441 style = style.lower() 442 if 'javascript:' in style: 443 return True 444 if 'expression(' in style: 445 return True 446 return False
447
448 - def clean_html(self, html):
449 if isinstance(html, basestring): 450 return_string = True 451 doc = fromstring(html) 452 else: 453 return_string = False 454 doc = copy.deepcopy(html) 455 self(doc) 456 if return_string: 457 return tostring(doc) 458 else: 459 return doc
# Default cleaner instance.  ``clean_html`` is rebound here to that
# instance's bound ``clean_html`` method.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Patterns for link-like text; each defines the named groups ``body``
# and ``host``.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): the listing this was taken from collapses several
# definitions at this point (its line numbers jump from 483 to 599).
# ``autolink`` and ``autolink_html``, used just below, are presumably
# among them -- restore them from the full module.

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

# Defaults for word_break(): tags and CSS classes whose text is left alone.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=u'\u200b'):
    """
    Breaks any long words found in the body of the text (not attributes).

    Modifies ``el`` and its descendants in place.  Words longer than
    ``max_width`` get ``break_character`` inserted into them.

    Doesn't affect any of the tags in ``avoid_elements`` (by default
    ``<pre>``, ``<textarea>`` and ``<code>``), nor elements that have
    one of ``avoid_classes`` (by default ``nobreak``) in their
    ``class`` attribute.  Skipping an element skips its descendants too;
    its tail text is still processed as part of its parent.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's avoid_elements, not the module default.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        for avoid in avoid_classes:
            if avoid in classes:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail belongs to this element's content, so it is broken
        # here even when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
649
def word_break_html(html, *args, **kw):
    """
    String-in, string-out variant of word_break(): parses ``html``,
    applies word_break() with the remaining arguments, and returns the
    serialized result.
    """
    tree = fromstring(html)
    word_break(tree, *args, **kw)
    serialized = tostring(tree)
    return serialized
654
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with ``break_character`` inserted into every
    whitespace-delimited word longer than ``max_width``.

    All whitespace in ``text`` is preserved as-is.
    """
    def _fix_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word
    # Substitute word by word.  A global text.replace(word, ...) would
    # also rewrite ``word`` where it is merely a substring of a longer
    # word, leaving that longer word broken in the wrong places.
    return re.sub(r'(?u)\S+', _fix_word, text)
# Preferred break positions: right after any non-letter character.
_break_prefer_re = re.compile(r'[^a-z]', re.I)


def _insert_break(word, width, break_character):
    """
    Return ``word`` cut into chunks of at most ``width`` characters,
    with ``break_character`` between the chunks.

    A chunk ends right after its last non-letter character when that is
    less than 10 characters short of ``width``; otherwise it is exactly
    ``width`` characters.  A non-positive ``width`` returns ``word``
    unchanged.
    """
    if width <= 0:
        # No chunk size makes sense, and the loop below could never
        # make progress.
        return word
    pieces = []
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nice break:
            if last_break.end() > width-10:
                # FIXME: should the break character be at the end of the
                # chunk, or the beginning of the next chunk?
                start = word[:last_break.end()]
        pieces.append(start)
        pieces.append(break_character)
        word = word[len(start):]
    pieces.append(word)
    return ''.join(pieces)
682