
Source Code for Module lxml.html.clean

import copy    # used by Cleaner.clean_html() below
import re
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring

try:
    set
except NameError:
    from sets import Set as set

__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']

# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
#   Particularly the CSS cleaning; most of the tag cleaning is integrated now
# I have multiple kinds of schemes searched; but should schemes be
#   whitelisted instead?
# max height?
# remove images?  Also in CSS?  background attribute?
# Some way to whitelist object, iframe, etc (e.g., if you want to
#   allow *just* embedded YouTube movies)
# Log what was deleted and why?
# style="behavior: ..." might be bad in IE?
# Should we have something for just <meta http-equiv>?  That's the worst of the
#   metas.
# UTF-7 detections?  Example:
#     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
#   you don't always have to have the charset set, if the page has no charset
#   and there's UTF7-like code in it.
# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php


# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Do I have to worry about @\nimport?
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution:
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
_whitespace_re = re.compile(r'\s+')
# FIXME: should data: be blocked?

# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

_find_external_links = etree.XPath(
    "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")

def clean_html(html, **kw):
    """
    Like clean(), but takes a text input document, and returns a text
    document.
    """
    doc = fromstring(html)
    clean(doc, **kw)
    return tostring(doc)

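# A minimal usage sketch of clean_html() (the markup below is an assumed
# example, not taken from this module):
#
#     >>> from lxml.html.clean import clean_html
#     >>> clean_html('<p onmouseover="alert(1)">hi<script>alert(2)</script></p>')
#
# With the default Cleaner settings used by clean(), the <script> element is
# dropped entirely and the onmouseover attribute is stripped, so only the
# <p> and its text survive.
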
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags.

    ``meta``:
        Removes any ``<meta>`` tags.

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes).

    ``frames``:
        Removes any frame-related tags.

    ``forms``:
        Removes any form tags.

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.

    ``allow_tags``:
        A list of tags to include (default: include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from `feedparser
        <http://feedparser.org/docs/html-sanitization.html>`_).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False

    def __init__(self, **kw):
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)

    def __call__(self, doc):
        """
        Cleans the document.
        """
        if hasattr(doc, 'getroot'):
            # ElementTree
            doc = doc.getroot()
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.getiterator('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)
        kill_tags = set()
        remove_tags = set(self.remove_tags or ())
        if self.allow_tags:
            allow_tags = set(self.allow_tags)
        else:
            allow_tags = set()
        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.getiterator():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs handles events attributes itself
                for el in doc.getiterator():
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If we're deleting style then we don't have to remove JS links
                # from styles, otherwise...
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.getiterator('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            for el in _find_styled_elements(doc):
                del el.attrib['style']
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.getiterator('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            kill_tags.update(('applet', 'param'))
            # The alternate contents that are in an iframe are a good fallback:
            # FIXME: somehow embed seems to be getting data, but from what I
            # can tell the embed tag is supposed to always be empty
            remove_tags.update(('iframe', 'object', 'embed', 'layer'))
        if self.frames:
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.getiterator():
            if el.tag in kill_tags:
                _kill.append(el)
            elif el.tag in remove_tags:
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.getiterator():
                if el.tag not in allow_tags:
                    bad.append(el)
            for el in bad:
                el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                el.set('rel', 'nofollow')

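    # A sketch (assumed input, not from the module) of the kill_tags /
    # remove_tags distinction used in __call__ above: killed elements are
    # dropped with their content (drop_tree), while removed elements are
    # unwrapped but keep their children (drop_tag).
    #
    #     >>> from lxml.html import fromstring, tostring
    #     >>> doc = fromstring('<div><script>x()</script><form><b>hi</b></form></div>')
    #     >>> Cleaner()(doc)
    #     >>> tostring(doc)
    #
    # With the defaults, the <script> disappears entirely, while the <form>
    # is unwrapped so '<b>hi</b>' remains inside the <div>.
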
    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        bad = []
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        bad = []
        for el in doc.getiterator(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    # (the private _remove_javascript_link() helper used by __call__ above
    # is collapsed in this listing)

    _decomment_re = re.compile(r'/\*.*?\*/', re.S)

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempts to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.
        """
        style = self._decomment_re.sub('', style)
        style = style.replace('\\', '')
        style = _whitespace_re.sub('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

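    # A sketch (assumed inputs) of what the check above catches:
    #
    #     >>> cleaner = Cleaner()
    #     >>> cleaner._has_sneaky_javascript('expre/* x */ssion(alert(1))')
    #     True
    #     >>> cleaner._has_sneaky_javascript('color: red')
    #     False
    #
    # Comments, backslashes and whitespace are stripped first, so split or
    # obfuscated forms of expression(...) and javascript: still match.
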
    def clean_html(self, html):
        if isinstance(html, basestring):
            return_string = True
            doc = fromstring(html)
        else:
            return_string = False
            doc = copy.deepcopy(html)
        self(doc)
        if return_string:
            return tostring(doc)
        else:
            return doc

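# A usage sketch for the Cleaner options documented above (the HTML string
# is an assumed example):
#
#     >>> from lxml.html.clean import Cleaner
#     >>> cleaner = Cleaner(style=True, add_nofollow=True)
#     >>> cleaner.clean_html(
#     ...     '<p style="color: red"><a href="http://example.com/">x</a></p>')
#
# With style=True the style attribute and any <style> tags are dropped, and
# add_nofollow=True adds rel="nofollow" to the external <a>.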

clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

# (the definitions of autolink() and autolink_html() are collapsed in this
# listing)

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']

def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=u'\u200b'):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<textarea>`` and ``<pre>``

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        dont_break = False
        class_name = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_name:
                dont_break = True
                break
        if dont_break:
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)

def word_break_html(html, *args, **kw):
    doc = fromstring(html)
    word_break(doc, *args, **kw)
    return tostring(doc)

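# A sketch of word_break_html() on an assumed input: a long unbroken word is
# split every max_width characters by inserting U+200B (zero width space):
#
#     >>> from lxml.html.clean import word_break_html
#     >>> word_break_html(u'<p>abcdefghij</p>', max_width=4)
#
# The text inside the <p> becomes u'abcd\u200befgh\u200bij'.
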
def _break_text(text, max_width, break_character):
    words = text.split()
    for word in words:
        if len(word) > max_width:
            replacement = _insert_break(word, max_width, break_character)
            text = text.replace(word, replacement)
    return text

_break_prefer_re = re.compile(r'[^a-z]', re.I)

def _insert_break(word, width, break_character):
    orig_word = word
    result = ''
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nice break:
            if last_break.end() > width-10:
                # FIXME: should the break character be at the end of the
                # chunk, or the beginning of the next chunk?
                start = word[:last_break.end()]
        result += start + break_character
        word = word[len(start):]
    result += word
    return result

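# A worked example (assumed input) of _insert_break() above:
#
#     >>> _insert_break('aaaa-bbbb-cccc', 5, '|')
#     'aaaa-|bbbb-|cccc'
#
# Each chunk is cut at the last non-letter character found near the width
# limit (here the '-'), and the break character is appended after the chunk.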