lxml.html.clean

100 """ 101 Instances cleans the document of each of the possible offending 102 elements. The cleaning is controlled by attributes; you can 103 override attributes in a subclass, or set them in the constructor. 104 105 ``scripts``: 106 Removes any ``<script>`` tags. 107 108 ``javascript``: 109 Removes any Javascript, like an ``onclick`` attribute. 110 111 ``comments``: 112 Removes any comments. 113 114 ``style``: 115 Removes any style tags or attributes. 116 117 ``links``: 118 Removes any ``<link>`` tags 119 120 ``meta``: 121 Removes any ``<meta>`` tags 122 123 ``page_structure``: 124 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. 125 126 ``processing_instructions``: 127 Removes any processing instructions. 128 129 ``embedded``: 130 Removes any embedded objects (flash, iframes) 131 132 ``frames``: 133 Removes any frame-related tags 134 135 ``forms``: 136 Removes any form tags 137 138 ``annoying_tags``: 139 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marque>`` 140 141 ``remove_tags``: 142 A list of tags to remove. 143 144 ``allow_tags``: 145 A list of tags to include (default include all). 146 147 ``remove_unknown_tags``: 148 Remove any tags that aren't standard parts of HTML. 149 150 ``safe_attrs_only``: 151 If true, only include 'safe' attributes (specifically the list 152 from `feedparser 153 <http://feedparser.org/docs/html-sanitization.html>`_). 154 155 ``add_nofollow``: 156 If true, then any <a> tags will have ``rel="nofollow"`` added to them. 157 158 ``host_whitelist``: 159 A list or set of hosts that you can use for embedded content 160 (for content like ``<object>``, ``<link rel="stylesheet">``, etc). 161 You can also implement/override the method 162 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to 163 implement more complex rules for what can be embedded. 164 Anything that passes this test will be shown, regardless of 165 the value of (for instance) ``embedded``. 
166 167 Note that this parameter might not work as intended if you do not 168 make the links absolute before doing the cleaning. 169 170 ``whitelist_tags``: 171 A set of tags that can be included with ``host_whitelist``. 172 The default is ``iframe`` and ``embed``; you may wish to 173 include other tags like ``script``, or you may want to 174 implement ``allow_embedded_url`` for more control. Set to None to 175 include all tags. 176 177 This modifies the document *in place*. 178 """ 179 180 scripts = True 181 javascript = True 182 comments = True 183 style = False 184 links = True 185 meta = True 186 page_structure = True 187 processing_instructions = True 188 embedded = True 189 frames = True 190 forms = True 191 annoying_tags = True 192 remove_tags = None 193 allow_tags = None 194 remove_unknown_tags = True 195 safe_attrs_only = True 196 add_nofollow = False 197 host_whitelist = () 198 whitelist_tags = set(['iframe', 'embed']) 199

200 - def __init__(self, **kw):

201 for name, value in kw.items(): 202 if not hasattr(self, name): 203 raise TypeError( 204 "Unknown parameter: %s=%r" % (name, value)) 205 setattr(self, name, value)

# Used to lookup the primary URL for a given tag that is up for
# removal.  Maps a tag name to the attribute (or list of attributes)
# holding the URL that allow_element() checks:
_tag_link_attrs = dict(
    script='src',
    link='href',
    # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
    # From what I can tell, both attributes can contain a link:
    applet=['code', 'object'],
    iframe='src',
    embed='src',
    layer='src',
    # FIXME: there doesn't really seem like a general way to figure out what
    # links an <object> tag uses; links often go in <param> tags with values
    # that we don't really know.  You'd have to have knowledge about specific
    # kinds of plugins (probably keyed off classid), and match against those.
    ##object=?,
    # FIXME: not looking at the action currently, because it is more complex
    # than that -- if you keep the form, you should keep the form controls.
    ##form='action',
    a='href',
    )

def __call__(self, doc):
    """
    Cleans the document, modifying it *in place*.

    ``doc`` may be an element or an ElementTree-like object (anything
    with ``getroot()``).  Which cleaning steps run is decided by the
    option attributes on ``self``.

    Elements are handled in one of two ways:

    * *killed* -- dropped together with their content (``drop_tree``)
    * *removed* -- the tag is dropped but its children and text are
      pulled up into the parent (``drop_tag``)

    The root element cannot be dropped; if it is scheduled for either
    treatment it is rewritten in place instead.

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    for el in doc.iter():
        tag = el.tag
        if isinstance(tag, basestring):
            el.tag = _nons(tag)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    kill_tags = set()
    remove_tags = set(self.remove_tags or ())
    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the names: we delete while looping.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                # Chain the two substitutions: the second must work on
                # the output of the first, not on the original text,
                # or the Javascript stripping is thrown away.
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow:
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        for el in _find_styled_elements(doc):
            del el.attrib['style']
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        # The element is spelled <marquee>; 'marque' never matches.
        remove_tags.update(('blink', 'marquee'))

    # Collect first, modify afterwards, so the tree is not changed
    # while it is being iterated.
    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        for el in bad:
            el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')

396

def allow_follow(self, anchor):
    """
    Hook consulted for each external link when ``add_nofollow`` is on.

    Return a true value to leave ``anchor`` without
    ``rel="nofollow"``.  This default implementation always answers
    ``False``, so every such anchor gets marked; override it to
    exempt some of them.
    """
    return False

402

def allow_element(self, el):
    """
    Decide whether ``el`` may stay although its tag is scheduled for
    removal.

    The tag must be listed in ``_tag_link_attrs``; the attribute (or
    each of the attributes) named there must hold a non-empty URL that
    ``allow_embedded_url`` accepts.  Otherwise the answer is ``False``.
    """
    link_attrs = self._tag_link_attrs
    if el.tag not in link_attrs:
        return False
    spec = link_attrs[el.tag]
    if not isinstance(spec, (list, tuple)):
        # Single attribute: hand back whatever allow_embedded_url says.
        target = el.get(spec)
        return self.allow_embedded_url(el, target) if target else False
    # Several attributes: every one must be present and acceptable.
    for attr_name in spec:
        target = el.get(attr_name)
        if not (target and self.allow_embedded_url(el, target)):
            return False
    return True

420

def allow_embedded_url(self, el, url):
    """
    Tell whether ``url`` is an acceptable source for the embedding
    element ``el``.

    Accepted only when all of these hold:

    * ``whitelist_tags`` is None, or contains the element's tag
    * the URL scheme is ``http`` or ``https``
    * the host (lower-cased, port stripped) is in ``host_whitelist``
    """
    permitted_tags = self.whitelist_tags
    if permitted_tags is not None and el.tag not in permitted_tags:
        return False
    parts = urlsplit(url)
    if parts[0] not in ('http', 'https'):
        return False
    # Drop any ":port" suffix and compare case-insensitively.
    host = parts[1].lower().split(':', 1)[0]
    return host in self.host_whitelist

432

def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Only comment nodes are visited; each one whose text matches
    ``_conditional_comment_re`` is dropped from ``doc`` in place.
    """
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)

443

444 - def _kill_elements(self, doc, condition, iterate=None):

445 bad = [] 446 for el in doc.iter(iterate): 447 if condition(el): 448 bad.append(el) 449 for el in bad: 450 el.drop_tree()

451

def _remove_javascript_link(self, link):
    """
    Link-rewriting callback: blank out ``javascript:`` style URLs.

    Returns ``''`` when the link, with whitespace stripped, matches
    ``_javascript_scheme_re``; otherwise returns ``link`` unchanged.
    """
    # links like "j a v a s c r i p t:" might be interpreted in IE,
    # so test a copy of the link with the whitespace taken out
    squeezed = _substitute_whitespace('', link)
    # FIXME: should this be None to delete?
    return '' if _javascript_scheme_re.search(squeezed) else link

# Bound ``sub`` of a regex matching ``/* ... */`` comments; the match is
# non-greedy and re.S lets a comment span several lines.  Called as
# ``_substitute_comments(replacement, text)``.
_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

def _has_sneaky_javascript(self, style):
    """
    Report whether ``style`` hides Javascript behind obfuscation.

    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  The text
    is therefore normalised first -- comments, backslashes and
    whitespace taken out, then lower-cased -- and only then searched
    for ``javascript:`` and ``expression(``.

    Typically the caller kills the entire style on a true result;
    plain, unobfuscated Javascript is caught by other rules that
    remove only the offending part.
    """
    without_comments = self._substitute_comments('', style)
    without_escapes = without_comments.replace('\\', '')
    normalized = _substitute_whitespace('', without_escapes).lower()
    return any(marker in normalized
               for marker in ('javascript:', 'expression('))

482

def clean_html(self, html):
    """
    Return a cleaned version of ``html`` without touching the input.

    A string is parsed with ``fromstring``; anything else is
    deep-copied.  The working document is cleaned by calling this
    object on it, then converted back according to the type of the
    original argument by ``_transform_result``.
    """
    result_type = type(html)
    doc = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    self(doc)
    return _transform_result(result_type, doc)

Source Code for Module lxml.html.clean