lxml.html.clean

"""
Instances clean a document of each kind of potentially offending
element.  The cleaning is driven by the attributes listed below;
override them in a subclass, or pass them as keyword arguments to
the constructor.

``scripts``:
    Removes any ``<script>`` tags.

``javascript``:
    Removes any Javascript, like an ``onclick`` attribute.

``comments``:
    Removes any comments.

``style``:
    Removes any style tags or attributes.

``links``:
    Removes any ``<link>`` tags.

``meta``:
    Removes any ``<meta>`` tags.

``page_structure``:
    Removes the structural parts of a page: ``<head>``, ``<html>``,
    ``<title>``.

``processing_instructions``:
    Removes any processing instructions.

``embedded``:
    Removes any embedded objects (flash, iframes).

``frames``:
    Removes any frame-related tags.

``forms``:
    Removes any form tags.

``annoying_tags``:
    Removes tags that aren't *wrong*, but are annoying, such as
    ``<blink>``.

``remove_tags``:
    A list of tags to remove.

``allow_tags``:
    A list of tags to include (default include all).

``remove_unknown_tags``:
    Remove any tags that aren't standard parts of HTML.

``safe_attrs_only``:
    If true, only include 'safe' attributes (specifically the list
    from `feedparser
    <http://feedparser.org/docs/html-sanitization.html>`_).

``add_nofollow``:
    If true, then any <a> tags will have ``rel="nofollow"`` added
    to them.

``host_whitelist``:
    A list or set of hosts that you can use for embedded content
    (for content like ``<object>``, ``<link rel="stylesheet">``,
    etc).  You can also implement/override the method
    ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
    implement more complex rules for what can be embedded.
    Anything that passes this test will be shown, regardless of
    the value of (for instance) ``embedded``.

    Note that this parameter might not work as intended if you do
    not make the links absolute before doing the cleaning.

``whitelist_tags``:
    A set of tags that can be included with ``host_whitelist``.
    The default is ``iframe`` and ``embed``; you may wish to
    include other tags like ``script``, or you may want to
    implement ``allow_embedded_url`` for more control.  Set to
    None to include all tags.

This modifies the document *in place*.
"""

# Whole categories of content, switched on or off:
scripts = True
javascript = True
comments = True
style = False
links = True
meta = True
page_structure = True
processing_instructions = True
embedded = True
frames = True
forms = True
annoying_tags = True

# Explicit tag lists and attribute filtering:
remove_tags = None
allow_tags = None
remove_unknown_tags = True
safe_attrs_only = True
add_nofollow = False

# Exceptions for embedded content served from trusted hosts:
host_whitelist = ()
whitelist_tags = set(('iframe', 'embed'))

200 - def __init__(self, **kw):

201 for name, value in kw.items(): 202 if not hasattr(self, name): 203 raise TypeError( 204 "Unknown parameter: %s=%r" % (name, value)) 205 setattr(self, name, value)

206 207 # Used to lookup the primary URL for a given tag that is up for 208 # removal: 209 _tag_link_attrs = dict( 210 script='src', 211 link='href', 212 # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html 213 # From what I can tell, both attributes can contain a link: 214 applet=['code', 'object'], 215 iframe='src', 216 embed='src', 217 layer='src', 218 # FIXME: there doesn't really seem like a general way to figure out what 219 # links an <object> tag uses; links often go in <param> tags with values 220 # that we don't really know. You'd have to have knowledge about specific 221 # kinds of plugins (probably keyed off classid), and match against those. 222 ##object=?, 223 # FIXME: not looking at the action currently, because it is more complex 224 # than than -- if you keep the form, you should keep the form controls. 225 ##form='action', 226 a='href', 227 ) 228

def __call__(self, doc):
    """
    Cleans the document, modifying it *in place*.

    ``doc`` may be an element or an ElementTree-like object (anything
    with ``getroot()``).  Which content is stripped is decided by the
    option attributes on ``self``.  Elements are handled in two ways:
    "killed" elements are dropped together with their content
    (``drop_tree``), "removed" elements lose only the tag itself
    (``drop_tag``).  An element for which ``allow_element`` returns
    true is left alone.

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set.
    """
    if self.remove_unknown_tags and self.allow_tags:
        # Checked up front so that a contradictory configuration is
        # reported before the document has been modified.
        raise ValueError(
            "It does not make sense to pass in both allow_tags and remove_unknown_tags")
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    for el in doc.iter():
        tag = el.tag
        if isinstance(tag, basestring):
            el.tag = _nons(tag)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.  The matches are
    # collected first because the loop renames the very tag it
    # filters on.
    for el in list(doc.iter('image')):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    kill_tags = set()
    remove_tags = set(self.remove_tags or ())
    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the names: attributes are deleted in the loop.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new``: substituting on ``old`` again would
                # throw away the Javascript removal done just above.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow
                # it.  (Chained on ``new`` for the same reason as above.)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>
        # or <object>; these are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.add('applet')
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        # ``marquee`` is the real tag name; the previous spelling
        # ``marque`` never matched the element.
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards, so the tree is not changed
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        # allow_tags is known to be empty here (validated on entry).
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = [el for el in doc.iter() if el.tag not in allow_tags]
        for el in bad:
            el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')

395

def allow_follow(self, anchor):
    """
    Hook deciding whether ``anchor`` is exempt from ``rel="nofollow"``.

    This implementation exempts nothing and always returns False;
    override it to suppress ``rel="nofollow"`` on some anchors.
    """
    return False

401

def allow_element(self, el):
    """
    Tell whether ``el`` may stay even though its tag is up for removal.

    The URL attribute(s) for the tag come from ``_tag_link_attrs``.
    Tags without an entry are never allowed.  Every listed attribute
    must hold a non-empty URL that ``allow_embedded_url`` accepts.
    """
    try:
        link_attrs = self._tag_link_attrs[el.tag]
    except KeyError:
        return False
    if not isinstance(link_attrs, (list, tuple)):
        # Single attribute: hand back the URL check's own result.
        url = el.get(link_attrs)
        if url:
            return self.allow_embedded_url(el, url)
        return False
    # Several attributes: all of them have to pass.
    for attr_name in link_attrs:
        url = el.get(attr_name)
        if not (url and self.allow_embedded_url(el, url)):
            return False
    return True

419

def allow_embedded_url(self, el, url):
    """
    Decide whether ``url``, embedded through element ``el``, is
    allowed by ``whitelist_tags`` and ``host_whitelist``.

    Returns True only if the tag of ``el`` is in ``whitelist_tags``
    (or ``whitelist_tags`` is None), the URL scheme is ``http`` or
    ``https``, and the URL's host is in ``host_whitelist``.  URLs
    that cannot be parsed are rejected.
    """
    if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
        return False
    try:
        parts = urlsplit(url)
    except ValueError:
        # Unparseable URL (e.g. a broken IPv6 literal): not trusted.
        return False
    if parts.scheme not in ('http', 'https'):
        return False
    # ``hostname`` is the lower-cased host with userinfo and port
    # stripped.  Cutting the netloc at the first ':' instead would
    # read "trusted.com:pw@evil.com" as "trusted.com" and let a
    # foreign host through the whitelist.
    return parts.hostname in self.host_whitelist

431

def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Every comment node under ``doc`` whose text matches
    ``_conditional_comment_re`` is dropped via ``_kill_elements``.
    """
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)

442

443 - def _kill_elements(self, doc, condition, iterate=None):

444 bad = [] 445 for el in doc.iter(iterate): 446 if condition(el): 447 bad.append(el) 448 for el in bad: 449 el.drop_tree()

450

def _remove_javascript_link(self, link):
    """
    Link-rewriting callback: return '' for a link that matches
    ``_javascript_scheme_re`` once it has been passed through
    ``_substitute_whitespace``, otherwise return ``link`` unchanged.
    """
    # links like "j a v a s c r i p t:" might be interpreted in IE
    collapsed = _substitute_whitespace('', link)
    is_javascript = _javascript_scheme_re.search(collapsed)
    # FIXME: should this be None to delete?
    return '' if is_javascript else link

458 459 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub 460

def _has_sneaky_javascript(self, style):
    """
    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
    checks for attempts to do stuff like this.

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.

    Returns True if, after the normalization below, the style
    contains ``javascript:`` or ``expression(``.
    """
    # Undo the obfuscations first: comments, backslashes, whitespace
    # and letter case.
    normalized = self._substitute_comments('', style)
    normalized = normalized.replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    return 'javascript:' in normalized or 'expression(' in normalized

481

def clean_html(self, html):
    """
    Clean ``html`` without modifying the caller's object.

    A string is parsed with ``fromstring``; anything else is
    deep-copied.  The cleaned document is handed to
    ``_transform_result`` together with the type of the input.
    """
    result_type = type(html)
    doc = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    self(doc)
    return _transform_result(result_type, doc)

Source Code for Module lxml.html.clean