Package lxml :: Package html :: Module clean
[hide private]
[frames | no frames]

Source Code for Module lxml.html.clean

  1  import re 
  2  import copy 
  3  import urlparse 
  4  from lxml import etree 
  5  from lxml.html import defs 
  6  from lxml.html import fromstring, tostring 
  7   
# Fall back to ``sets.Set`` when the ``set`` builtin does not exist.
try:
    set
except NameError:
    from sets import Set as set

# Names exported by ``from lxml.html.clean import *``.
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']
 15   
 16  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 17  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 18  # I have multiple kinds of schemes searched; but should schemes be 
 19  #   whitelisted instead? 
 20  # max height? 
 21  # remove images?  Also in CSS?  background attribute? 
 22  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 23  #   allow *just* embedded YouTube movies) 
 24  # Log what was deleted and why? 
 25  # style="behavior: ..." might be bad in IE? 
 26  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 27  #   metas. 
 28  # UTF-7 detections?  Example: 
 29  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 30  #   you don't always have to have the charset set, if the page has no charset 
 31  #   and there's UTF7-like code in it. 
 32  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 33   
 34   
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript.  The pattern matches ``expression(`` lazily up to
# the first ``)``, ignoring case and spanning newlines:
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Do I have to worry about @\nimport?
# Matches ``@import`` with optional whitespace after the ``@``:
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution:
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
# Bound ``sub`` of a whitespace-run pattern; call it as
# ``_substitute_whitespace(replacement, text)``.
_substitute_whitespace = re.compile(r'\s+').sub
# FIXME: should data: be blocked?

# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
# Matches the ``[if ...]>`` marker found in IE conditional comments:
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# XPath: the context element and all descendants that carry a ``style``
# attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# XPath: <a> elements whose href is non-blank and does not begin with '#'.
_find_external_links = etree.XPath(
    "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
 60   
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from `feedparser
        <http://feedparser.org/docs/html-sanitization.html>`_).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    # Class-level defaults for every option described above.  The
    # constructor only accepts keyword arguments that name one of these
    # (or any other existing attribute).
    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])
162 - def __init__(self, **kw):
163 for name, value in kw.items(): 164 if not hasattr(self, name): 165 raise TypeError( 166 "Unknown parameter: %s=%r" % (name, value)) 167 setattr(self, name, value)
    # Used to lookup the primary URL for a given tag that is up for
    # removal.  A value is either one attribute name or a list of
    # attribute names that must all be checked:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know.  You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )
191 - def __call__(self, doc):
192 """ 193 Cleans the document. 194 """ 195 if hasattr(doc, 'getroot'): 196 # ElementTree instance, instead of an element 197 doc = doc.getroot() 198 # Normalize a case that IE treats <image> like <img>, and that 199 # can confuse either this step or later steps. 200 for el in doc.iter('image'): 201 el.tag = 'img' 202 if not self.comments: 203 # Of course, if we were going to kill comments anyway, we don't 204 # need to worry about this 205 self.kill_conditional_comments(doc) 206 kill_tags = set() 207 remove_tags = set(self.remove_tags or ()) 208 if self.allow_tags: 209 allow_tags = set(self.allow_tags) 210 else: 211 allow_tags = set() 212 if self.scripts: 213 kill_tags.add('script') 214 if self.safe_attrs_only: 215 safe_attrs = set(defs.safe_attrs) 216 for el in doc.iter(): 217 attrib = el.attrib 218 for aname in attrib.keys(): 219 if aname not in safe_attrs: 220 del attrib[aname] 221 if self.javascript: 222 if not self.safe_attrs_only: 223 # safe_attrs handles events attributes itself 224 for el in doc.iter(): 225 attrib = el.attrib 226 for aname in attrib.keys(): 227 if aname.startswith('on'): 228 del attrib[aname] 229 doc.rewrite_links(self._remove_javascript_link, 230 resolve_base_href=False) 231 if not self.style: 232 # If we're deleting style then we don't have to remove JS links 233 # from styles, otherwise... 234 for el in _find_styled_elements(doc): 235 old = el.get('style') 236 new = _css_javascript_re.sub('', old) 237 new = _css_import_re.sub('', old) 238 if self._has_sneaky_javascript(new): 239 # Something tricky is going on... 
240 del el.attrib['style'] 241 elif new != old: 242 el.set('style', new) 243 for el in list(doc.iter('style')): 244 if el.get('type', '').lower().strip() == 'text/javascript': 245 el.drop_tree() 246 continue 247 old = el.text or '' 248 new = _css_javascript_re.sub('', old) 249 # The imported CSS can do anything; we just can't allow: 250 new = _css_import_re.sub('', old) 251 if self._has_sneaky_javascript(new): 252 # Something tricky is going on... 253 el.text = '/* deleted */' 254 elif new != old: 255 el.text = new 256 if self.comments or self.processing_instructions: 257 # FIXME: why either? I feel like there's some obscure reason 258 # because you can put PIs in comments...? But I've already 259 # forgotten it 260 kill_tags.add(etree.Comment) 261 if self.processing_instructions: 262 kill_tags.add(etree.ProcessingInstruction) 263 if self.style: 264 kill_tags.add('style') 265 for el in _find_styled_elements(doc): 266 del el.attrib['style'] 267 if self.links: 268 kill_tags.add('link') 269 elif self.style or self.javascript: 270 # We must get rid of included stylesheets if Javascript is not 271 # allowed, as you can put Javascript in them 272 for el in list(doc.iter('link')): 273 if 'stylesheet' in el.get('rel', '').lower(): 274 # Note this kills alternate stylesheets as well 275 el.drop_tree() 276 if self.meta: 277 kill_tags.add('meta') 278 if self.page_structure: 279 remove_tags.update(('head', 'html', 'title')) 280 if self.embedded: 281 # FIXME: is <layer> really embedded? 282 # We should get rid of any <param> tags not inside <applet>; 283 # These are not really valid anyway. 
284 for el in list(doc.iter('param')): 285 found_parent = False 286 parent = el.getparent() 287 while parent is not None and parent.tag not in ('applet', 'object'): 288 parent = parent.getparent() 289 if parent is None: 290 el.drop_tree() 291 kill_tags.update(('applet',)) 292 # The alternate contents that are in an iframe are a good fallback: 293 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) 294 if self.frames: 295 # FIXME: ideally we should look at the frame links, but 296 # generally frames don't mix properly with an HTML 297 # fragment anyway. 298 kill_tags.update(defs.frame_tags) 299 if self.forms: 300 remove_tags.add('form') 301 kill_tags.update(('button', 'input', 'select', 'textarea')) 302 if self.annoying_tags: 303 remove_tags.update(('blink', 'marque')) 304 305 _remove = [] 306 _kill = [] 307 for el in doc.iter(): 308 if el.tag in kill_tags: 309 if self.allow_element(el): 310 continue 311 _kill.append(el) 312 elif el.tag in remove_tags: 313 if self.allow_element(el): 314 continue 315 _remove.append(el) 316 317 if _remove and _remove[0] == doc: 318 # We have to drop the parent-most tag, which we can't 319 # do. Instead we'll rewrite it: 320 el = _remove.pop(0) 321 el.tag = 'div' 322 el.attrib.clear() 323 elif _kill and _kill[0] == doc: 324 # We have to drop the parent-most element, which we can't 325 # do. 
Instead we'll clear it: 326 el = _kill.pop(0) 327 if el.tag != 'html': 328 el.tag = 'div' 329 el.clear() 330 331 for el in _kill: 332 el.drop_tree() 333 for el in _remove: 334 el.drop_tag() 335 336 allow_tags = self.allow_tags 337 if self.remove_unknown_tags: 338 if allow_tags: 339 raise ValueError( 340 "It does not make sense to pass in both allow_tags and remove_unknown_tags") 341 allow_tags = set(defs.tags) 342 if allow_tags: 343 bad = [] 344 for el in doc.iter(): 345 if el.tag not in allow_tags: 346 bad.append(el) 347 for el in bad: 348 el.drop_tag() 349 if self.add_nofollow: 350 for el in _find_external_links(doc): 351 if not self.allow_follow(el): 352 el.set('rel', 'nofollow')
353
354 - def allow_follow(self, anchor):
355 """ 356 Override to suppress rel="nofollow" on some anchors. 357 """ 358 return False
359
360 - def allow_element(self, el):
361 if el.tag not in self._tag_link_attrs: 362 return False 363 attr = self._tag_link_attrs[el.tag] 364 if isinstance(attr, (list, tuple)): 365 for one_attr in attr: 366 url = el.get(one_attr) 367 if not url: 368 return False 369 if not self.allow_embedded_url(el, url): 370 return False 371 return True 372 else: 373 url = el.get(attr) 374 if not url: 375 return False 376 return self.allow_embedded_url(el, url)
377
378 - def allow_embedded_url(self, el, url):
379 if (self.whitelist_tags is not None 380 and el.tag not in self.whitelist_tags): 381 return False 382 scheme, netloc, path, query, fragment = urlparse.urlsplit(url) 383 netloc = netloc.lower().split(':', 1)[0] 384 if scheme not in ('http', 'https'): 385 return False 386 if netloc in self.host_whitelist: 387 return True 388 return False
389
390 - def kill_conditional_comments(self, doc):
391 """ 392 IE conditional comments basically embed HTML that the parser 393 doesn't normally see. We can't allow anything like that, so 394 we'll kill any comments that could be conditional. 395 """ 396 bad = [] 397 self._kill_elements( 398 doc, lambda el: _conditional_comment_re.search(el.text), 399 etree.Comment)
400
401 - def _kill_elements(self, doc, condition, iterate=None):
402 bad = [] 403 for el in doc.iter(iterate): 404 if condition(el): 405 bad.append(el) 406 for el in bad: 407 el.drop_tree()
    # NOTE(review): this listing jumps from source line 408 to 416 here,
    # so whatever is defined in that gap is not visible -- presumably the
    # ``_remove_javascript_link`` method that ``__call__`` refers to.

    # Bound ``sub`` of a pattern matching ``/* ... */`` comments (lazy,
    # spanning newlines); call it as
    # ``self._substitute_comments(replacement, text)``.
    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
419 - def _has_sneaky_javascript(self, style):
420 """ 421 Depending on the browser, stuff like ``e x p r e s s i o n(...)`` 422 can get interpreted, or ``expre/* stuff */ssion(...)``. This 423 checks for attempt to do stuff like this. 424 425 Typically the response will be to kill the entire style; if you 426 have just a bit of Javascript in the style another rule will catch 427 that and remove only the Javascript from the style; this catches 428 more sneaky attempts. 429 """ 430 style = self._substitute_comments('', style) 431 style = style.replace('\\', '') 432 style = _substitute_whitespace('', style) 433 style = style.lower() 434 if 'javascript:' in style: 435 return True 436 if 'expression(' in style: 437 return True 438 return False
439
440 - def clean_html(self, html):
441 if isinstance(html, basestring): 442 return_string = True 443 doc = fromstring(html) 444 else: 445 return_string = False 446 doc = copy.deepcopy(html) 447 self(doc) 448 if return_string: 449 return tostring(doc) 450 else: 451 return doc

# Module-level cleaner built with the default options, and its
# ``clean_html`` method exposed as a plain function.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Patterns for link-like text.  Each exposes a ``body`` group and a
# ``host`` group.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# NOTE(review): this listing skips source lines 476-590 at this point;
# the definitions of ``autolink`` and ``autolink_html`` used just below
# are presumably in that gap and are not visible here.

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

# Defaults for ``word_break()``: tags and CSS classes left unbroken.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']

def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=u'\u200b'):
    """
    Breaks any long words found in the body of the text (not attributes).

    ``el`` is modified in place, together with all of its descendants.
    Words longer than ``max_width`` characters get ``break_character``
    inserted into them.

    Doesn't affect any of the tags in ``avoid_elements`` (by default
    ``<pre>``, ``<textarea>`` and ``<code>``), nor elements carrying one
    of the classes in ``avoid_classes`` (by default ``nobreak``).  A
    skipped element is skipped together with everything inside it.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's ``avoid_elements``; the module-level default
    # must not be consulted here or the parameter has no effect.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        for avoid in avoid_classes:
            if avoid in classes:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail belongs to this element's content, so it is broken
        # even when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
641
def word_break_html(html, *args, **kw):
    """
    String front end for ``word_break()``.

    Parses ``html`` with ``fromstring()``, runs ``word_break()`` on the
    result (forwarding any extra arguments) and returns the tree
    serialized by ``tostring()``.
    """
    root = fromstring(html)
    word_break(root, *args, **kw)
    return tostring(root)
646
647 -def _break_text(text, max_width, break_character):
648 words = text.split() 649 for word in words: 650 if len(word) > max_width: 651 replacement = _insert_break(word, max_width, break_character) 652 text = text.replace(word, replacement) 653 return text
654 655 _break_prefer_re = re.compile(r'[^a-z]', re.I) 656
657 -def _insert_break(word, width, break_character):
658 orig_word = word 659 result = '' 660 while len(word) > width: 661 start = word[:width] 662 breaks = list(_break_prefer_re.finditer(start)) 663 if breaks: 664 last_break = breaks[-1] 665 # Only walk back up to 10 characters to find a nice break: 666 if last_break.end() > width-10: 667 # FIXME: should the break character be at the end of the 668 # chunk, or the beginning of the next chunk? 669 start = word[:last_break.end()] 670 result += start + break_character 671 word = word[len(start):] 672 result += word 673 return result
674