lxml.html.clean

91 """ 92 Instances cleans the document of each of the possible offending 93 elements. The cleaning is controlled by attributes; you can 94 override attributes in a subclass, or set them in the constructor. 95 96 ``scripts``: 97 Removes any ``<script>`` tags. 98 99 ``javascript``: 100 Removes any Javascript, like an ``onclick`` attribute. 101 102 ``comments``: 103 Removes any comments. 104 105 ``style``: 106 Removes any style tags or attributes. 107 108 ``links``: 109 Removes any ``<link>`` tags 110 111 ``meta``: 112 Removes any ``<meta>`` tags 113 114 ``page_structure``: 115 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. 116 117 ``processing_instructions``: 118 Removes any processing instructions. 119 120 ``embedded``: 121 Removes any embedded objects (flash, iframes) 122 123 ``frames``: 124 Removes any frame-related tags 125 126 ``forms``: 127 Removes any form tags 128 129 ``annoying_tags``: 130 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>`` 131 132 ``remove_tags``: 133 A list of tags to remove. Only the tags will be removed, 134 their content will get pulled up into the parent tag. 135 136 ``kill_tags``: 137 A list of tags to kill. Killing also removes the tag's content, 138 i.e. the whole subtree, not just the tag itself. 139 140 ``allow_tags``: 141 A list of tags to include (default include all). 142 143 ``remove_unknown_tags``: 144 Remove any tags that aren't standard parts of HTML. 145 146 ``safe_attrs_only``: 147 If true, only include 'safe' attributes (specifically the list 148 from the feedparser HTML sanitisation web site). 149 150 ``safe_attrs``: 151 A set of attribute names to override the default list of attributes 152 considered 'safe' (when safe_attrs_only=True). 153 154 ``add_nofollow``: 155 If true, then any <a> tags will have ``rel="nofollow"`` added to them. 
156 157 ``host_whitelist``: 158 A list or set of hosts that you can use for embedded content 159 (for content like ``<object>``, ``<link rel="stylesheet">``, etc). 160 You can also implement/override the method 161 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to 162 implement more complex rules for what can be embedded. 163 Anything that passes this test will be shown, regardless of 164 the value of (for instance) ``embedded``. 165 166 Note that this parameter might not work as intended if you do not 167 make the links absolute before doing the cleaning. 168 169 ``whitelist_tags``: 170 A set of tags that can be included with ``host_whitelist``. 171 The default is ``iframe`` and ``embed``; you may wish to 172 include other tags like ``script``, or you may want to 173 implement ``allow_embedded_url`` for more control. Set to None to 174 include all tags. 175 176 This modifies the document *in place*. 177 """ 178 179 scripts = True 180 javascript = True 181 comments = True 182 style = False 183 links = True 184 meta = True 185 page_structure = True 186 processing_instructions = True 187 embedded = True 188 frames = True 189 forms = True 190 annoying_tags = True 191 remove_tags = None 192 allow_tags = None 193 kill_tags = None 194 remove_unknown_tags = True 195 safe_attrs_only = True 196 safe_attrs = defs.safe_attrs 197 add_nofollow = False 198 host_whitelist = () 199 whitelist_tags = set(['iframe', 'embed']) 200

def __init__(self, **kw):
    """
    Configure the cleaner from keyword options.

    Every keyword must name an attribute that already exists on the
    instance (normally a class-level default); its value is stored on
    the instance.  An unrecognised keyword raises ``TypeError``.
    """
    for option in kw:
        # Reject typos early instead of silently creating a new attribute.
        if not hasattr(self, option):
            raise TypeError(
                "Unknown parameter: %s=%r" % (option, kw[option]))
        setattr(self, option, kw[option])

# Maps a tag that is up for removal to the attribute (or list of
# attributes) holding its primary URL.
_tag_link_attrs = {
    'script': 'src',
    'link': 'href',
    # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
    # From what I can tell, both attributes can contain a link:
    'applet': ['code', 'object'],
    'iframe': 'src',
    'embed': 'src',
    'layer': 'src',
    # FIXME: <object> is deliberately absent.  There doesn't seem to be a
    # general way to figure out what links an <object> tag uses; links
    # often go in <param> tags with values that we don't really know.
    # You'd have to have knowledge about specific kinds of plugins
    # (probably keyed off classid), and match against those.
    # FIXME: <form> (its 'action') is deliberately absent as well, because
    # it is more complex than that -- if you keep the form, you should
    # keep the form controls.
    'a': 'href',
}

def __call__(self, doc):
    """
    Cleans the document, modifying it in place.

    ``doc`` is either an element or an object with a ``getroot()``
    method (an ElementTree), in which case its root element is cleaned.
    Which parts are removed is controlled by the option attributes on
    ``self`` (``scripts``, ``javascript``, ``style``, ...).

    Raises ``ValueError`` when both ``allow_tags`` and
    ``remove_unknown_tags`` are set.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    # kill_tags: dropped together with their content.
    # remove_tags: only the tag is dropped, children are pulled up.
    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Iterate over a snapshot: attributes are deleted in the loop.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and
                self.safe_attrs == defs.safe_attrs):
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new`` so the Javascript removal above is kept.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow it.
                # Chain on ``new`` so the Javascript removal above is kept.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet> or
        # <object>; these are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, mutate afterwards, so the tree is not changed
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] is doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] is doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # The root cannot be dropped; turn it into a bare <div>.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')

399

def allow_follow(self, anchor):
    """
    Hook deciding whether ``anchor`` is exempt from ``rel="nofollow"``.

    This base implementation exempts nothing and always returns False;
    override it to return a true value for anchors that should be left
    without the attribute.
    """
    return False

405

def allow_element(self, el):
    """
    Tell whether ``el`` may stay even though its tag is up for removal.

    The tag must be listed in ``_tag_link_attrs``.  When the entry is a
    list or tuple of attribute names, every one of them must hold a
    non-empty URL accepted by ``allow_embedded_url``; otherwise the
    single named attribute must, and the result of
    ``allow_embedded_url`` is returned as-is.
    """
    link_attrs = self._tag_link_attrs
    if el.tag in link_attrs:
        spec = link_attrs[el.tag]
        if not isinstance(spec, (list, tuple)):
            # Single attribute: hand back whatever the URL check says.
            target = el.get(spec)
            return self.allow_embedded_url(el, target) if target else False
        for attr_name in spec:
            target = el.get(attr_name)
            if not target or not self.allow_embedded_url(el, target):
                return False
        return True
    return False

423

def allow_embedded_url(self, el, url):
    """
    Tell whether ``url`` on element ``el`` points at a whitelisted host.

    Returns True only if the element's tag is in ``whitelist_tags``
    (or ``whitelist_tags`` is None), the URL scheme is ``http`` or
    ``https``, and the URL's host is in ``host_whitelist``.
    """
    if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
        return False
    parts = urlsplit(url)
    if parts.scheme not in ('http', 'https'):
        return False
    # ``hostname`` is lower-cased and excludes both the port and any
    # ``user:password@`` prefix.  Cutting the raw netloc at the first
    # ':' would treat the user name in
    # ``http://trusted.example:pw@evil.example/`` as the host.
    if parts.hostname in self.host_whitelist:
        return True
    return False

435

def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Every comment in ``doc`` whose text matches
    ``_conditional_comment_re`` is dropped from the tree, in place.
    """
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)

446

def _kill_elements(self, doc, condition, iterate=None):
    """
    Drop every element yielded by ``doc.iter(iterate)`` for which
    ``condition(element)`` is true, together with its subtree.
    """
    # Gather the matches first so nothing is dropped mid-iteration.
    doomed = [node for node in doc.iter(iterate) if condition(node)]
    for node in doomed:
        node.drop_tree()

454

def _remove_javascript_link(self, link):
    """
    Link rewriter: return an empty string for links that use a
    Javascript scheme and the unchanged link otherwise.
    """
    # links like "j a v a s c r i p t:" might be interpreted in IE, so
    # the scheme test runs on a copy with the whitespace taken out.
    squeezed = _substitute_whitespace('', link)
    # FIXME: should this be None to delete?
    return '' if _javascript_scheme_re.search(squeezed) else link

# ``sub`` method of a pattern matching ``/* ... */`` comments,
# non-greedily and across line breaks.
_substitute_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub

def _has_sneaky_javascript(self, style):
    """
    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
    checks for attempts to do stuff like this.

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.
    """
    # Undo the obfuscation first: comments, backslashes, whitespace
    # and letter case are all taken out of the picture.
    normalized = self._substitute_comments('', style).replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    return 'javascript:' in normalized or 'expression(' in normalized

485

def clean_html(self, html):
    """
    Clean ``html`` and return the result in the same form it came in.

    A string is parsed with ``fromstring``; anything else is deep-copied
    so that the caller's object is left untouched.  The cleaned document
    is converted back with ``_transform_result`` using the type of the
    original argument.
    """
    try:
        string_types = basestring  # noqa: F821 -- Python 2 builtin
    except NameError:
        # No ``basestring`` available (Python 3 without a module shim).
        string_types = (str, bytes)
    result_type = type(html)
    if isinstance(html, string_types):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    self(doc)
    return _transform_result(result_type, doc)

Source Code for Module lxml.html.clean