lxml.html.clean

"""
Instances clean the document of each of the possible offending
elements.  The cleaning is controlled by attributes; you can
override attributes in a subclass, or set them in the constructor.

``scripts``:
    Removes any ``<script>`` tags.

``javascript``:
    Removes any Javascript, like an ``onclick`` attribute. Also removes
    stylesheets as they could contain Javascript.

``comments``:
    Removes any comments.

``style``:
    Removes any style tags.

``inline_style``:
    Removes any style attributes.  Defaults to the value of the
    ``style`` option.

``links``:
    Removes any ``<link>`` tags

``meta``:
    Removes any ``<meta>`` tags

``page_structure``:
    Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

``processing_instructions``:
    Removes any processing instructions.

``embedded``:
    Removes any embedded objects (flash, iframes)

``frames``:
    Removes any frame-related tags

``forms``:
    Removes any form tags

``annoying_tags``:
    Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

``remove_tags``:
    A list of tags to remove.  Only the tags will be removed,
    their content will get pulled up into the parent tag.

``kill_tags``:
    A list of tags to kill.  Killing also removes the tag's content,
    i.e. the whole subtree, not just the tag itself.

``allow_tags``:
    A list of tags to include (default include all).

``remove_unknown_tags``:
    Remove any tags that aren't standard parts of HTML.

``safe_attrs_only``:
    If true, only include 'safe' attributes (specifically the list
    from the feedparser HTML sanitisation web site).

``safe_attrs``:
    A set of attribute names to override the default list of attributes
    considered 'safe' (when safe_attrs_only=True).

``add_nofollow``:
    If true, then any <a> tags will have ``rel="nofollow"`` added to them.

``host_whitelist``:
    A list or set of hosts that you can use for embedded content
    (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
    You can also implement/override the method
    ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
    implement more complex rules for what can be embedded.
    Anything that passes this test will be shown, regardless of
    the value of (for instance) ``embedded``.

    Note that this parameter might not work as intended if you do not
    make the links absolute before doing the cleaning.

    Note that you may also need to set ``whitelist_tags``.

``whitelist_tags``:
    A set of tags that can be included with ``host_whitelist``.
    The default is ``iframe`` and ``embed``; you may wish to
    include other tags like ``script``, or you may want to
    implement ``allow_embedded_url`` for more control.  Set to None to
    include all tags.

This modifies the document *in place*.
"""

# Default values for the options described in the docstring above.
scripts = True
javascript = True
comments = True
style = False
# None means "not set explicitly"; the constructor then copies ``style``.
inline_style = None
links = True
meta = True
page_structure = True
processing_instructions = True
embedded = True
frames = True
forms = True
annoying_tags = True
# None is treated as "no tags" for the three tag lists below.
remove_tags = None
allow_tags = None
kill_tags = None
remove_unknown_tags = True
safe_attrs_only = True
safe_attrs = defs.safe_attrs
add_nofollow = False
# Empty by default, so no host is whitelisted for embedded content.
host_whitelist = ()
whitelist_tags = set(['iframe', 'embed'])

def __init__(self, **kw):
    """
    Apply keyword overrides on top of the default option attributes.

    Every keyword must name an attribute that already exists on the
    instance; anything else raises ``TypeError``.  When ``inline_style``
    is still None and was not passed explicitly, it takes the value of
    ``style``.
    """
    for option in kw:
        setting = kw[option]
        if hasattr(self, option):
            setattr(self, option, setting)
        else:
            raise TypeError(
                "Unknown parameter: %s=%r" % (option, setting))
    # An explicit inline_style=None from the caller is left untouched.
    if self.inline_style is not None or 'inline_style' in kw:
        return
    self.inline_style = self.style

# Maps a tag that is up for removal to the attribute (or list of
# attributes) holding its primary URL:
_tag_link_attrs = {
    'script': 'src',
    'link': 'href',
    # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
    # From what I can tell, both attributes can contain a link:
    'applet': ['code', 'object'],
    'iframe': 'src',
    'embed': 'src',
    'layer': 'src',
    # FIXME: there doesn't really seem like a general way to figure out what
    # links an <object> tag uses; links often go in <param> tags with values
    # that we don't really know.  You'd have to have knowledge about specific
    # kinds of plugins (probably keyed off classid), and match against those.
    ##object=?,
    # FIXME: not looking at the action currently, because it is more complex
    # than that -- if you keep the form, you should keep the form controls.
    ##form='action',
    'a': 'href',
}

def __call__(self, doc):
    """
    Cleans the document.

    ``doc`` is an element, or an object with ``getroot()`` (in which
    case its root element is cleaned).  The tree is modified in place
    according to the option attributes of this instance; nothing is
    returned.

    Raises ``ValueError`` when both ``allow_tags`` and
    ``remove_unknown_tags`` are set.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    xhtml_to_html(doc)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())
    allow_tags = set(self.allow_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter(etree.Element):
            attrib = el.attrib
            for aname in attrib.keys():
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and
                self.safe_attrs == defs.safe_attrs):
            # safe_attrs handles events attributes itself
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        # If we're deleting style then we don't have to remove JS links
        # from styles, otherwise...
        if not self.inline_style:
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
        if not self.style:
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow.
                # Chain on ``new`` (not ``old``) so the substitution on
                # the previous line is not thrown away.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
    if self.inline_style:
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                if not self.allow_element(el):
                    el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter('param')):
            # Walk up until an <applet>/<object> ancestor is found, or
            # the top of the tree is passed (parent becomes None).
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards, so the tree is not changed
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    _kill.reverse()  # start with innermost tags
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # The root cannot be dropped; turn it into a bare <div>.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                rel = el.get('rel')
                if rel:
                    # Cheap substring test first, then an exact
                    # whitespace-delimited token test.
                    if ('nofollow' in rel
                            and ' nofollow ' in (' %s ' % rel)):
                        continue
                    rel = '%s nofollow' % rel
                else:
                    rel = 'nofollow'
                el.set('rel', rel)

430

def allow_follow(self, anchor):
    """
    Hook for subclasses: return a true value for ``anchor`` to keep
    ``rel="nofollow"`` from being added to it.  The default never
    exempts an anchor.
    """
    return False

436

def allow_element(self, el):
    """
    Decide whether ``el`` may stay because its link target is allowed.

    Tags without an entry in ``_tag_link_attrs`` are never allowed.
    For a tag with several link attributes, every one of them must be
    present and accepted by ``allow_embedded_url``; for a tag with a
    single link attribute the result of ``allow_embedded_url`` is
    returned as is.
    """
    link_map = self._tag_link_attrs
    if el.tag not in link_map:
        return False
    attr_spec = link_map[el.tag]
    if not isinstance(attr_spec, (list, tuple)):
        target = el.get(attr_spec)
        return self.allow_embedded_url(el, target) if target else False
    for attr_name in attr_spec:
        target = el.get(attr_name)
        # A missing/empty URL or a rejected one disqualifies the element.
        if not (target and self.allow_embedded_url(el, target)):
            return False
    return True

454

def allow_embedded_url(self, el, url):
    """
    Return True when ``url`` may be embedded through element ``el``.

    The element's tag must be in ``whitelist_tags`` (unless that is
    None), the URL scheme must be ``http`` or ``https``, and the
    lower-cased host (the network location up to the first ``:``)
    must be in ``host_whitelist``.
    """
    permitted_tags = self.whitelist_tags
    if permitted_tags is not None and el.tag not in permitted_tags:
        return False
    parts = urlsplit(url)
    if parts[0] not in ('http', 'https'):
        return False
    host = parts[1].lower().split(':', 1)[0]
    return host in self.host_whitelist

466

def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Only comment nodes are visited; each one whose text matches
    ``_conditional_comment_re`` is dropped from ``doc`` in place.
    """
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)

477

def _kill_elements(self, doc, condition, iterate=None):
    """
    Drop every node of ``doc`` for which ``condition(node)`` is true.

    ``iterate`` is handed to ``doc.iter()`` to restrict which nodes are
    visited.  Matches are gathered before any of them is dropped, so
    the tree is not modified while it is being walked.
    """
    doomed = [node for node in doc.iter(iterate) if condition(node)]
    for node in doomed:
        node.drop_tree()

485

def _remove_javascript_link(self, link):
    """
    Link-rewriting callback: return ``''`` for a link that turns out
    to use a Javascript scheme, and the link unchanged otherwise.
    """
    # links like "j a v a s c r i p t:" might be interpreted in IE, so
    # the check runs on the URL-decoded link with whitespace removed
    decoded = unquote_plus(link)
    collapsed = _substitute_whitespace('', decoded)
    # FIXME: should this be None to delete?
    return '' if _is_javascript_scheme(collapsed) else link

# Bound ``sub`` of a pattern matching CSS ``/* ... */`` comments
# (non-greedy; ``.`` also matches newlines).
_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

def _has_sneaky_javascript(self, style):
    """
    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
    checks for attempts to do stuff like this.

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.

    Returns True when the style, with comments, backslashes and
    whitespace removed and lower-cased, contains ``javascript:`` or
    ``expression(``.
    """
    normalized = self._substitute_comments('', style).replace('\\', '')
    normalized = _substitute_whitespace('', normalized).lower()
    return 'javascript:' in normalized or 'expression(' in normalized

516

def clean_html(self, html):
    """
    Clean ``html`` without modifying the caller's object.

    A string is parsed with ``fromstring``; anything else is
    deep-copied.  The cleaned document is passed to
    ``_transform_result`` together with the type of the input, and
    that result is returned.
    """
    is_markup_string = isinstance(html, basestring)
    doc = fromstring(html) if is_markup_string else copy.deepcopy(html)
    self(doc)
    return _transform_result(type(html), doc)

Source Code for Module lxml.html.clean