lxml.html.clean

"""
Instances clean the document of each of the possible offending
elements.  The cleaning is controlled by attributes; you can
override attributes in a subclass, or set them in the constructor.

``scripts``:
    Removes any ``<script>`` tags.

``javascript``:
    Removes any Javascript, like an ``onclick`` attribute.

``comments``:
    Removes any comments.

``style``:
    Removes any style tags or attributes.

``links``:
    Removes any ``<link>`` tags

``meta``:
    Removes any ``<meta>`` tags

``page_structure``:
    Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

``processing_instructions``:
    Removes any processing instructions.

``embedded``:
    Removes any embedded objects (flash, iframes)

``frames``:
    Removes any frame-related tags

``forms``:
    Removes any form tags

``annoying_tags``:
    Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

``remove_tags``:
    A list of tags to remove.  Only the tags will be removed,
    their content will get pulled up into the parent tag.

``kill_tags``:
    A list of tags to kill.  Killing also removes the tag's content,
    i.e. the whole subtree, not just the tag itself.

``allow_tags``:
    A list of tags to include (default include all).  Cleaning raises
    ``ValueError`` if this is set while ``remove_unknown_tags`` is
    also true.

``remove_unknown_tags``:
    Remove any tags that aren't standard parts of HTML.

``safe_attrs_only``:
    If true, only include 'safe' attributes (specifically the list
    from `feedparser
    <http://feedparser.org/docs/html-sanitization.html>`_).

``add_nofollow``:
    If true, then any <a> tags will have ``rel="nofollow"`` added to them.

``host_whitelist``:
    A list or set of hosts that you can use for embedded content
    (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
    You can also implement/override the method
    ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
    implement more complex rules for what can be embedded.
    Anything that passes this test will be shown, regardless of
    the value of (for instance) ``embedded``.

    Note that this parameter might not work as intended if you do not
    make the links absolute before doing the cleaning.

``whitelist_tags``:
    A set of tags that can be included with ``host_whitelist``.
    The default is ``iframe`` and ``embed``; you may wish to
    include other tags like ``script``, or you may want to
    implement ``allow_embedded_url`` for more control.  Set to None to
    include all tags.

This modifies the document *in place*.
"""

# Default values for the options described in the docstring above.
# ``__init__`` only accepts keyword arguments whose names already exist
# as attributes, so every supported option needs a default here.
scripts = True
javascript = True
comments = True
style = False
links = True
meta = True
page_structure = True
processing_instructions = True
embedded = True
frames = True
forms = True
annoying_tags = True
# ``None`` means "no extra tags"; ``__call__`` treats it as empty.
remove_tags = None
allow_tags = None
kill_tags = None
remove_unknown_tags = True
safe_attrs_only = True
add_nofollow = False
host_whitelist = ()
whitelist_tags = set(['iframe', 'embed'])

def __init__(self, **kw):
    """
    Configure the cleaner from keyword options.

    Every keyword must name an attribute that already exists on the
    instance (normally one of the class-level defaults); its value is
    then stored on the instance.

    Raises ``TypeError`` for a keyword that does not match an existing
    attribute.  Options that precede the offending one have already
    been applied when the error is raised.
    """
    for option in kw:
        setting = kw[option]
        if hasattr(self, option):
            setattr(self, option, setting)
            continue
        raise TypeError(
            "Unknown parameter: %s=%r" % (option, setting))

# Maps a tag name to the attribute (or list of attributes) that holds
# the primary URL of an element of that tag which is up for removal:
_tag_link_attrs = {
    'script': 'src',
    'link': 'href',
    # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
    # From what I can tell, both attributes can contain a link:
    'applet': ['code', 'object'],
    'iframe': 'src',
    'embed': 'src',
    'layer': 'src',
    # FIXME: there doesn't really seem to be a general way to figure out
    # what links an <object> tag uses; links often go in <param> tags
    # with values that we don't really know.  You'd have to have
    # knowledge about specific kinds of plugins (probably keyed off
    # classid), and match against those.
    ##object=?,
    # FIXME: not looking at the action currently, because it is more
    # complex than that -- if you keep the form, you should keep the
    # form controls.
    ##form='action',
    'a': 'href',
}

def __call__(self, doc):
    """
    Cleans the document, modifying it in place.

    ``doc`` is either an element or an ElementTree-like object with a
    ``getroot()`` method, in which case its root element is cleaned.

    The steps, in order:

    1. Convert XHTML to HTML and rename ``<image>`` to ``<img>``.
    2. Strip unsafe attributes, Javascript links and Javascript hidden
       in CSS, depending on the options set on this instance.
    3. Collect the tags to kill (drop with content) and to remove
       (drop but keep content), then apply both.
    4. Drop tags outside the allowed set.
    5. Optionally add ``rel="nofollow"`` to external links.

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set.  Note that this check happens
    late, after the document has already been modified.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the names: we delete from attrib while looping.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                # Snapshot the names: we delete from attrib while looping.
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new`` so the Javascript stripping above is
                # not thrown away by the @import stripping.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow it.
                # Chain on ``new`` to keep the Javascript stripping.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>
        # or <object>; these are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards, so the tree is not changed
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] is doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] is doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # The root cannot be dropped; turn it into a bare <div>.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')

403

def allow_follow(self, anchor):
    """
    Decide whether ``anchor`` is exempt from ``rel="nofollow"``.

    This default implementation exempts nothing and always returns
    False; override it to suppress rel="nofollow" on some anchors.
    """
    return False

409

def allow_element(self, el):
    """
    Tell whether ``el`` may stay even though its tag is slated for
    removal.

    The tag must be listed in ``_tag_link_attrs``.  Each link
    attribute registered for the tag must be present, non-empty and
    accepted by ``allow_embedded_url``; otherwise False is returned.
    """
    try:
        spec = self._tag_link_attrs[el.tag]
    except KeyError:
        return False
    if not isinstance(spec, (list, tuple)):
        # Single attribute: hand back whatever allow_embedded_url says.
        target = el.get(spec)
        if target:
            return self.allow_embedded_url(el, target)
        return False
    # Several attributes: every one of them has to pass.
    for attr_name in spec:
        target = el.get(attr_name)
        if not target:
            return False
        if not self.allow_embedded_url(el, target):
            return False
    return True

427

def allow_embedded_url(self, el, url):
    """
    Tell whether ``url`` may be embedded through the element ``el``.

    Returns True only when all of the following hold:

    * ``whitelist_tags`` is None or contains the tag of ``el``;
    * the URL scheme is ``http`` or ``https``;
    * the lower-cased host, with anything from the first ``:`` on
      removed, is in ``host_whitelist``.
    """
    permitted_tags = self.whitelist_tags
    if permitted_tags is not None and el.tag not in permitted_tags:
        return False
    parts = urlsplit(url)
    # Keep only the host: lower-case it and cut at the first colon.
    host = parts[1].lower().split(':', 1)[0]
    if parts[0] not in ('http', 'https'):
        return False
    return host in self.host_whitelist

439

def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Only comment nodes are examined; a comment is dropped when
    ``_conditional_comment_re`` matches somewhere in its text.
    """
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)

450

def _kill_elements(self, doc, condition, iterate=None):
    """
    Drop every node under ``doc`` for which ``condition(node)`` is true.

    ``iterate`` is passed straight to ``doc.iter()`` to restrict which
    nodes are visited.  Matching nodes are gathered first and dropped
    with ``drop_tree()`` afterwards, so the tree is not modified while
    it is being iterated.
    """
    doomed = [node for node in doc.iter(iterate) if condition(node)]
    for node in doomed:
        node.drop_tree()

458

def _remove_javascript_link(self, link):
    """
    Link rewriter: return an empty string for Javascript links and
    the link unchanged otherwise.
    """
    # links like "j a v a s c r i p t:" might be interpreted in IE,
    # so test a copy with the whitespace substituted away
    squeezed = _substitute_whitespace('', link)
    if not _javascript_scheme_re.search(squeezed):
        return link
    # FIXME: should this be None to delete?
    return ''

# Bound ``sub`` of a pattern matching ``/* ... */`` comments
# (non-greedy, may span lines); called as
# ``_substitute_comments(replacement, text)``.
_substitute_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub

def _has_sneaky_javascript(self, style):
    """
    Detect Javascript that has been obfuscated inside CSS text.

    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  To catch
    such attempts the style is normalized first: comments, backslashes
    and whitespace are removed and the rest is lower-cased.

    Returns True when the normalized text contains ``javascript:`` or
    ``expression(``.  Typically the caller then kills the entire style;
    plain, un-obfuscated Javascript is removed by another rule that
    strips only the Javascript from the style.
    """
    normalized = self._substitute_comments('', style).replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    for marker in ('javascript:', 'expression('):
        if marker in normalized:
            return True
    return False

489

def clean_html(self, html):
    """
    Clean ``html`` and return the result in the same form it came in.

    A string is parsed with ``fromstring``; anything else is
    deep-copied, so the caller's object is left untouched.  The
    cleaned document is converted back according to the type of the
    input by ``_transform_result``.
    """
    result_type = type(html)
    if not isinstance(html, basestring):
        doc = copy.deepcopy(html)
    else:
        doc = fromstring(html)
    self(doc)
    return _transform_result(result_type, doc)

Source Code for Module lxml.html.clean