lxml.html.clean

"""
Instances clean the document of each of the possible offending
elements.  The cleaning is controlled by attributes; you can
override attributes in a subclass, or set them in the constructor.

``scripts``:
    Removes any ``<script>`` tags.

``javascript``:
    Removes any Javascript, like an ``onclick`` attribute.

``comments``:
    Removes any comments.

``style``:
    Removes any style tags or attributes.

``links``:
    Removes any ``<link>`` tags

``meta``:
    Removes any ``<meta>`` tags

``page_structure``:
    Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

``processing_instructions``:
    Removes any processing instructions.

``embedded``:
    Removes any embedded objects (flash, iframes)

``frames``:
    Removes any frame-related tags

``forms``:
    Removes any form tags

``annoying_tags``:
    Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marque>``

``remove_tags``:
    A list of tags to remove.

``allow_tags``:
    A list of tags to include (default include all).

``remove_unknown_tags``:
    Remove any tags that aren't standard parts of HTML.

``safe_attrs_only``:
    If true, only include 'safe' attributes (specifically the list
    from `feedparser
    <http://feedparser.org/docs/html-sanitization.html>`_).

``add_nofollow``:
    If true, then any <a> tags will have ``rel="nofollow"`` added to them.

``host_whitelist``:
    A list or set of hosts that you can use for embedded content
    (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
    You can also implement/override the method
    ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
    implement more complex rules for what can be embedded.
    Anything that passes this test will be shown, regardless of
    the value of (for instance) ``embedded``.

    Note that this parameter might not work as intended if you do not
    make the links absolute before doing the cleaning.

``whitelist_tags``:
    A set of tags that can be included with ``host_whitelist``.
    The default is ``iframe`` and ``embed``; you may wish to
    include other tags like ``script``, or you may want to
    implement ``allow_embedded_url`` for more control.  Set to None to
    include all tags.

This modifies the document *in place*.
"""

# Default value for each option described above.  They are looked up
# as attributes, so a subclass or a constructor keyword can override
# any of them.

# Whole categories of markup that get stripped:
scripts = True
javascript = True
comments = True
style = False
links = True
meta = True
page_structure = True
processing_instructions = True
embedded = True
frames = True
forms = True
annoying_tags = True

# Explicit tag lists (None means "not configured"):
remove_tags = None
allow_tags = None
remove_unknown_tags = True

# Attribute and link handling:
safe_attrs_only = True
add_nofollow = False

# Exceptions for embedded content served from trusted hosts:
host_whitelist = ()
whitelist_tags = set(['iframe', 'embed'])

def __init__(self, **kw):
    """
    Override cleaning options for this instance.

    Each keyword must name an attribute that already exists on the
    instance (normally one of the class-level defaults); anything
    else raises ``TypeError``.
    """
    for option in kw:
        setting = kw[option]
        if hasattr(self, option):
            setattr(self, option, setting)
            continue
        # Reject misspelled or unsupported options instead of silently
        # storing an attribute that nothing will ever read.
        raise TypeError(
            "Unknown parameter: %s=%r" % (option, setting))

# Maps a tag that is up for removal to the attribute (or list of
# attributes) holding its primary URL:
_tag_link_attrs = {
    'script': 'src',
    'link': 'href',
    # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
    # From what I can tell, both attributes can contain a link:
    'applet': ['code', 'object'],
    'iframe': 'src',
    'embed': 'src',
    'layer': 'src',
    # FIXME: there doesn't really seem like a general way to figure out what
    # links an <object> tag uses; links often go in <param> tags with values
    # that we don't really know.  You'd have to have knowledge about specific
    # kinds of plugins (probably keyed off classid), and match against those.
    ##object=?,
    # FIXME: not looking at the action currently, because it is more complex
    # than that -- if you keep the form, you should keep the form controls.
    ##form='action',
    'a': 'href',
}

def __call__(self, doc):
    """
    Cleans the document, modifying it in place.

    ``doc`` is an element, or an object with a ``getroot()`` method
    (an ElementTree), in which case its root element is cleaned.
    Which cleaning steps run is decided by the option attributes on
    ``self``.

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set; this is checked before the
    document is touched.
    """
    # Validate the configuration up front so that a bad combination
    # does not leave the caller with a half-cleaned document.
    if self.remove_unknown_tags and self.allow_tags:
        raise ValueError(
            "It does not make sense to pass in both allow_tags and remove_unknown_tags")
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    # kill_tags: elements dropped together with their content.
    # remove_tags: elements whose tag is dropped but whose children
    # and text are kept.
    kill_tags = set()
    remove_tags = set(self.remove_tags or ())
    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the names so deleting cannot disturb iteration.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new`` so the Javascript removal above is
                # not thrown away by the @import removal.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow:
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        for el in _find_styled_elements(doc):
            del el.attrib['style']
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>
        # or <object>; these are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.add('applet')
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        # 'marque' is the historical (misspelled) entry; the tag
        # browsers actually render is <marquee>, so match both.
        remove_tags.update(('blink', 'marque', 'marquee'))

    # Collect first, modify afterwards, so the tree is not changed
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        # The conflicting allow_tags case was rejected at the top.
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = [el for el in doc.iter() if el.tag not in allow_tags]
        for el in bad:
            el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')

359

def allow_follow(self, anchor):
    """
    Hook deciding whether ``anchor`` may be left without
    ``rel="nofollow"``.

    The default answers ``False`` for every anchor; override it to
    exempt particular anchors.
    """
    return False

365

def allow_element(self, el):
    """
    Decide whether ``el`` may stay even though its tag is up for
    removal.

    The URL attribute(s) for the tag come from ``_tag_link_attrs``.
    Tags not listed there are never allowed.  Every listed attribute
    must be present, non-empty and accepted by
    ``allow_embedded_url(el, url)``.
    """
    try:
        link_attrs = self._tag_link_attrs[el.tag]
    except KeyError:
        return False
    if not isinstance(link_attrs, (list, tuple)):
        # Single attribute name: the URL check's answer is the answer.
        url = el.get(link_attrs)
        if url:
            return self.allow_embedded_url(el, url)
        return False
    # Several attributes may carry a link; all of them have to pass.
    for attr_name in link_attrs:
        url = el.get(attr_name)
        if not url:
            return False
        if not self.allow_embedded_url(el, url):
            return False
    return True

383

def allow_embedded_url(self, el, url):
    """
    Decide whether ``url``, found on element ``el``, may be embedded.

    The element's tag must be in ``whitelist_tags`` (unless that is
    None), the URL scheme must be ``http`` or ``https``, and the
    lower-cased host, with any ``:port`` suffix removed, must be in
    ``host_whitelist``.
    """
    permitted_tags = self.whitelist_tags
    if permitted_tags is not None and el.tag not in permitted_tags:
        return False
    parts = urlparse.urlsplit(url)
    if parts[0] not in ('http', 'https'):
        return False
    # Compare on the bare host name only.
    host = parts[1].lower().split(':', 1)[0]
    return host in self.host_whitelist

395

def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Every comment under ``doc`` whose text matches
    ``_conditional_comment_re`` is dropped, in place.
    """
    # Restrict iteration to comment nodes; the predicate then only
    # has to look at the comment text.
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)

406

def _kill_elements(self, doc, condition, iterate=None):
    """
    Drop every element under ``doc`` for which ``condition(el)`` is
    true.

    ``iterate`` is passed straight to ``doc.iter()`` to narrow which
    nodes are visited.  Matches are gathered first and dropped
    afterwards, so the tree is not modified while being walked.
    """
    doomed = [node for node in doc.iter(iterate) if condition(node)]
    for node in doomed:
        node.drop_tree()

414

def _remove_javascript_link(self, link):
    """
    Link-rewriting callback: return ``''`` for a Javascript link and
    the link unchanged otherwise.
    """
    # links like "j a v a s c r i p t:" might be interpreted in IE,
    # so test a copy with the whitespace taken out
    squeezed = _substitute_whitespace('', link)
    if not _javascript_scheme_re.search(squeezed):
        return link
    # FIXME: should this be None to delete?
    return ''

# Substitution function for CSS ``/* ... */`` comments, which may span
# lines.  Called as ``_substitute_comments(replacement, text)``.
_substitute_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub

def _has_sneaky_javascript(self, style):
    """
    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
    checks for attempts to do stuff like this.

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.
    """
    # Strip everything that could be used to break up a keyword:
    # comments, backslashes, whitespace and letter case.
    normalized = self._substitute_comments('', style)
    normalized = normalized.replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    for marker in ('javascript:', 'expression('):
        if marker in normalized:
            return True
    return False

445

def clean_html(self, html):
    """
    Clean ``html`` and return the result, leaving the argument
    untouched.

    A string is parsed, cleaned and serialized back to a string.
    Anything else is deep-copied, and the cleaned copy is returned.
    """
    if isinstance(html, basestring):
        parsed = fromstring(html)
        self(parsed)
        return tostring(parsed)
    # Work on a copy, since cleaning modifies the tree in place.
    duplicate = copy.deepcopy(html)
    self(duplicate)
    return duplicate

Source Code for Module lxml.html.clean