lxml.html.clean

"""
Instances clean a document of each of the possibly offending
elements.  Cleaning is driven by the attributes below; override them
in a subclass, or pass them as keyword arguments to the constructor.

``scripts``:
    Removes any ``<script>`` tags.

``javascript``:
    Removes any Javascript, like an ``onclick`` attribute.

``comments``:
    Removes any comments.

``style``:
    Removes any style tags or attributes.

``links``:
    Removes any ``<link>`` tags

``meta``:
    Removes any ``<meta>`` tags

``page_structure``:
    Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

``processing_instructions``:
    Removes any processing instructions.

``embedded``:
    Removes any embedded objects (flash, iframes)

``frames``:
    Removes any frame-related tags

``forms``:
    Removes any form tags

``annoying_tags``:
    Tags that aren't *wrong*, but are annoying.  ``<blink>`` and
    ``<marquee>``

``remove_tags``:
    A list of tags to remove.  Only the tags will be removed,
    their content will get pulled up into the parent tag.

``kill_tags``:
    A list of tags to kill.  Killing also removes the tag's content,
    i.e. the whole subtree, not just the tag itself.

``allow_tags``:
    A list of tags to include (default include all).

``remove_unknown_tags``:
    Remove any tags that aren't standard parts of HTML.

``safe_attrs_only``:
    If true, only include 'safe' attributes (specifically the list
    from the feedparser HTML sanitisation web site).

``add_nofollow``:
    If true, then any <a> tags will have ``rel="nofollow"`` added to
    them.

``host_whitelist``:
    A list or set of hosts that you can use for embedded content
    (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
    You can also implement/override the method
    ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
    implement more complex rules for what can be embedded.
    Anything that passes this test will be shown, regardless of
    the value of (for instance) ``embedded``.

    Note that this parameter might not work as intended if you do not
    make the links absolute before doing the cleaning.

``whitelist_tags``:
    A set of tags that can be included with ``host_whitelist``.
    The default is ``iframe`` and ``embed``; you may wish to
    include other tags like ``script``, or you may want to
    implement ``allow_embedded_url`` for more control.  Set to None to
    include all tags.

This modifies the document *in place*.
"""

# Switches that decide which kinds of content get stripped.
scripts = True
javascript = True
comments = True
processing_instructions = True
style = False
links = True
meta = True
page_structure = True
embedded = True
frames = True
forms = True
annoying_tags = True

# Explicit per-tag lists; None means "nothing extra".
remove_tags = None
kill_tags = None
allow_tags = None

# Whitelisting behaviour for tags and attributes.
remove_unknown_tags = True
safe_attrs_only = True

# Link handling.
add_nofollow = False

# Hosts (and the tags they apply to) whose embedded content is kept.
host_whitelist = ()
whitelist_tags = set(('iframe', 'embed'))

def __init__(self, **kw):
    """
    Apply keyword overrides on top of the class-level defaults.

    Every keyword must name an attribute that already exists on the
    instance (i.e. one of the documented options); anything else
    raises ``TypeError``.
    """
    for option in kw:
        setting = kw[option]
        if hasattr(self, option):
            setattr(self, option, setting)
            continue
        raise TypeError(
            "Unknown parameter: %s=%r" % (option, setting))

# Maps a tag that is up for removal to the attribute (or list of
# attributes) holding its primary URL.
_tag_link_attrs = {
    'script': 'src',
    'link': 'href',
    # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
    # From what I can tell, both attributes can contain a link:
    'applet': ['code', 'object'],
    'iframe': 'src',
    'embed': 'src',
    'layer': 'src',
    # FIXME: there doesn't really seem like a general way to figure out what
    # links an <object> tag uses; links often go in <param> tags with values
    # that we don't really know.  You'd have to have knowledge about specific
    # kinds of plugins (probably keyed off classid), and match against those.
    ##'object': ?,
    # FIXME: not looking at the action currently, because it is more complex
    # than that -- if you keep the form, you should keep the form controls.
    ##'form': 'action',
    'a': 'href',
}

def __call__(self, doc):
    """
    Cleans the document.

    ``doc`` may be an element or an ElementTree-like object (anything
    with ``getroot()``); it is modified *in place* and nothing is
    returned.

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set.  Note that this check runs late,
    after the kill/remove passes have already modified the document.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    # Work on copies so the configured lists are never mutated.
    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            for aname in attrib.keys():
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new``: substituting on ``old`` again would
                # throw away the Javascript removal done just above.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow
                # it.  (Chained on ``new`` to keep the Javascript removal.)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter('param')):
            # Walk up until an <applet>/<object> ancestor is found; if
            # the root is passed without finding one, drop the <param>.
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards, so the tree is not changed
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # The root cannot be dropped; rewrite it instead.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')

393

def allow_follow(self, anchor):
    """
    Hook deciding whether ``anchor`` is exempt from ``rel="nofollow"``.

    The default implementation exempts nothing and always returns
    False; override it to suppress rel="nofollow" on some anchors.
    """
    return False

399

def allow_element(self, el):
    """
    Decide whether ``el`` may stay even though its tag is slated for
    removal.

    The tag's URL attribute(s) are looked up in ``_tag_link_attrs``.
    Tags without an entry, or with a missing/empty URL attribute, are
    never allowed.  When several attributes are listed, every one of
    them must pass ``allow_embedded_url``.
    """
    try:
        link_attrs = self._tag_link_attrs[el.tag]
    except KeyError:
        return False
    if isinstance(link_attrs, (list, tuple)):
        for attr_name in link_attrs:
            target = el.get(attr_name)
            if not (target and self.allow_embedded_url(el, target)):
                return False
        return True
    target = el.get(link_attrs)
    if target:
        return self.allow_embedded_url(el, target)
    return False

417

def allow_embedded_url(self, el, url):
    """
    Return True if ``url`` on element ``el`` points at a whitelisted
    host.

    The element's tag must be in ``whitelist_tags`` (unless that is
    None), the scheme must be http or https, and the host part of the
    URL -- lower-cased, with anything after the first ``:`` dropped --
    must be in ``host_whitelist``.
    """
    permitted_tags = self.whitelist_tags
    if permitted_tags is not None and el.tag not in permitted_tags:
        return False
    parts = urlsplit(url)
    if parts[0] not in ('http', 'https'):
        return False
    host = parts[1].lower().split(':', 1)[0]
    return host in self.host_whitelist

429

def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Only comment nodes are visited; each one whose text matches
    ``_conditional_comment_re`` is dropped from ``doc`` in place.
    """
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)

440

def _kill_elements(self, doc, condition, iterate=None):
    """
    Drop (with ``drop_tree()``) every node of ``doc`` for which
    ``condition(node)`` is true.

    ``iterate`` is handed straight to ``doc.iter()`` to restrict which
    nodes are visited.  Matches are gathered before anything is
    dropped, so the tree is not modified while it is being walked.
    """
    doomed = [node for node in doc.iter(iterate) if condition(node)]
    for node in doomed:
        node.drop_tree()

448

def _remove_javascript_link(self, link):
    """
    Link-rewriting callback: return ``''`` for links that use a
    Javascript scheme and the unchanged ``link`` for everything else.
    """
    # links like "j a v a s c r i p t:" might be interpreted in IE,
    # so the scheme test runs on a whitespace-free copy
    compact = _substitute_whitespace('', link)
    if not _javascript_scheme_re.search(compact):
        return link
    # FIXME: should this be None to delete?
    return ''

# Bound ``sub`` of a pattern matching CSS ``/* ... */`` comments,
# non-greedy and spanning newlines.
_css_comment_pattern = re.compile(r'/\*.*?\*/', re.DOTALL)
_substitute_comments = _css_comment_pattern.sub

def _has_sneaky_javascript(self, style):
    """
    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
    checks for attempts to do stuff like this.

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.

    Returns True when, after removing CSS comments, backslashes and
    whitespace and lower-casing, the style contains ``javascript:`` or
    ``expression(``.
    """
    normalized = self._substitute_comments('', style)
    normalized = normalized.replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    for marker in ('javascript:', 'expression('):
        if marker in normalized:
            return True
    return False

479

def clean_html(self, html):
    """
    Clean ``html`` and return the result in the same form it came in.

    A string is parsed with ``fromstring``; anything else is
    deep-copied first, so the object passed in is left untouched.  The
    cleaned document is converted back according to the input's type
    by ``_transform_result``.
    """
    result_type = type(html)
    if not isinstance(html, basestring):
        doc = copy.deepcopy(html)
    else:
        doc = fromstring(html)
    self(doc)
    return _transform_result(result_type, doc)

Source Code for Module lxml.html.clean