1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 import urlparse
10 from lxml import etree
11 from lxml.html import defs
12 from lxml.html import fromstring, tostring
13
14 try:
15 set
16 except NameError:
17 from sets import Set as set
18
19 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
20 'word_break', 'word_break_html']
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# CSS ``expression(...)`` calls; DOTALL lets the argument list span lines.
_css_javascript_re = re.compile(r'expression\s*\(.*?\)',
                                re.IGNORECASE | re.DOTALL)

# The start of a CSS ``@import`` rule (whitespace after the ``@`` allowed).
_css_import_re = re.compile(r'@\s*import', re.IGNORECASE)

# URL schemes that are treated as script-carrying.  Callers strip all
# whitespace from the link (see _substitute_whitespace) before searching.
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):',
    re.IGNORECASE)

# Bound ``sub`` of a whitespace-run pattern: call as
# ``_substitute_whitespace(replacement, text)``.
_substitute_whitespace = re.compile(r'\s+').sub

# Text of a conditional comment opener such as ``[if IE 6]>``.
_conditional_comment_re = re.compile(r'\[if[\s\n\r]+.*?][\s\n\r]*>',
                                     re.IGNORECASE | re.DOTALL)
60
# Compiled XPath: the context element and every descendant that has a
# ``style`` attribute.
_find_styled_elements = etree.XPath("descendant-or-self::*[@style]")

# Compiled XPath: every ``<a>`` (context element included) whose href is
# non-blank and does not start with ``#``.
_find_external_links = etree.XPath(
    "descendant-or-self::a"
    "[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
66
68 """
69 Instances clean the document of each of the possible offending
70 elements. The cleaning is controlled by attributes; you can
71 override attributes in a subclass, or set them in the constructor.
72
73 ``scripts``:
74 Removes any ``<script>`` tags.
75
76 ``javascript``:
77 Removes any Javascript, like an ``onclick`` attribute.
78
79 ``comments``:
80 Removes any comments.
81
82 ``style``:
83 Removes any style tags or attributes.
84
85 ``links``:
86 Removes any ``<link>`` tags
87
88 ``meta``:
89 Removes any ``<meta>`` tags
90
91 ``page_structure``:
92 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
93
94 ``processing_instructions``:
95 Removes any processing instructions.
96
97 ``embedded``:
98 Removes any embedded objects (flash, iframes)
99
100 ``frames``:
101 Removes any frame-related tags
102
103 ``forms``:
104 Removes any form tags
105
106 ``annoying_tags``:
107 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marque>``
108
109 ``remove_tags``:
110 A list of tags to remove.
111
112 ``allow_tags``:
113 A list of tags to include (default include all).
114
115 ``remove_unknown_tags``:
116 Remove any tags that aren't standard parts of HTML.
117
118 ``safe_attrs_only``:
119 If true, only include 'safe' attributes (specifically the list
120 from `feedparser
121 <http://feedparser.org/docs/html-sanitization.html>`_).
122
123 ``add_nofollow``:
124 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
125
126 ``host_whitelist``:
127 A list or set of hosts that you can use for embedded content
128 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
129 You can also implement/override the method
130 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
131 implement more complex rules for what can be embedded.
132 Anything that passes this test will be shown, regardless of
133 the value of (for instance) ``embedded``.
134
135 Note that this parameter might not work as intended if you do not
136 make the links absolute before doing the cleaning.
137
138 ``whitelist_tags``:
139 A set of tags that can be included with ``host_whitelist``.
140 The default is ``iframe`` and ``embed``; you may wish to
141 include other tags like ``script``, or you may want to
142 implement ``allow_embedded_url`` for more control. Set to None to
143 include all tags.
144
145 This modifies the document *in place*.
146 """
147
148 scripts = True
149 javascript = True
150 comments = True
151 style = False
152 links = True
153 meta = True
154 page_structure = True
155 processing_instructions = True
156 embedded = True
157 frames = True
158 forms = True
159 annoying_tags = True
160 remove_tags = None
161 allow_tags = None
162 remove_unknown_tags = True
163 safe_attrs_only = True
164 add_nofollow = False
165 host_whitelist = ()
166 whitelist_tags = set(['iframe', 'embed'])
167
def __init__(self, **kw):
    """
    Override cleaning options for this instance.

    Every keyword must name an attribute that already exists on the
    object; anything else raises ``TypeError``.
    """
    for option in kw:
        setting = kw[option]
        if hasattr(self, option):
            setattr(self, option, setting)
        else:
            raise TypeError(
                "Unknown parameter: %s=%r" % (option, setting))
174
175
176
177 _tag_link_attrs = dict(
178 script='src',
179 link='href',
180
181
182 applet=['code', 'object'],
183 iframe='src',
184 embed='src',
185 layer='src',
186
187
188
189
190
191
192
193
194 a='href',
195 )
196
def __call__(self, doc):
    """
    Cleans the document, modifying it in place.

    ``doc`` is the element to clean; if it has a ``getroot`` method (an
    ElementTree) its root element is cleaned instead.  The option
    attributes on ``self`` (``scripts``, ``javascript``, ``style``, ...)
    select what gets removed.  Two removal modes are used:

    * "kill":   the element is dropped with ``drop_tree()``
    * "remove": the element is dropped with ``drop_tag()``

    ``allow_element(el)`` can veto either for an individual element.

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set.
    """
    if hasattr(doc, 'getroot'):
        # A tree was passed rather than an element.
        doc = doc.getroot()
    # Normalize <image> to <img> so the later steps see a single tag name.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # When comments are removed wholesale (below) this is unnecessary.
        self.kill_conditional_comments(doc)
    kill_tags = set()
    remove_tags = set(self.remove_tags or ())
    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the names: attributes are deleted while looping.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # Otherwise the safe_attrs pass above already handled on*
            # event attributes.
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # Styles are being kept, so scrub script out of them.
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new``: substituting on ``old`` again would
                # discard the expression() removal just performed.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Obfuscated script survived the substitutions.
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # Same chaining as for the style attributes above.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # NOTE(review): comments are also killed when only
        # processing_instructions is set; kept as in the original.
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        for el in _find_styled_elements(doc):
            del el.attrib['style']
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # <link> tags are kept in general, but stylesheet links are
        # dropped when style or Javascript is being removed.
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Substring test: alternate stylesheets match as well.
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # Drop <param> elements that have no <applet>/<object> ancestor.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # These are unwrapped rather than killed, so their fallback
        # content stays in the document.
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        # 'marquee' is the real tag name; the historical misspelling
        # 'marque' is kept so previously matched input still matches.
        remove_tags.update(('blink', 'marquee', 'marque'))

    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # The top-most element cannot be dropped; turn it into a bare
        # <div> instead.
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # The top-most element cannot be dropped; empty it instead.
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        for el in bad:
            el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')
359
361 """
362 Override to suppress rel="nofollow" on some anchors.
363 """
364 return False
365
def allow_element(self, el):
    """
    Return True when ``el`` is embedded content that may be kept.

    The tag must be listed in ``_tag_link_attrs``; every link attribute
    named there must be present, non-empty and accepted by
    ``allow_embedded_url``.
    """
    link_attrs = self._tag_link_attrs
    if el.tag not in link_attrs:
        return False
    names = link_attrs[el.tag]
    if not isinstance(names, (list, tuple)):
        # Single attribute name.
        target = el.get(names)
        if not target:
            return False
        return self.allow_embedded_url(el, target)
    # Several attribute names: each one has to pass.
    for name in names:
        target = el.get(name)
        if not target:
            return False
        if not self.allow_embedded_url(el, target):
            return False
    return True
383
def allow_embedded_url(self, el, url):
    """
    Decide whether embedded content at ``url`` may stay in element ``el``.

    Returns True only when all of the following hold:

    * ``whitelist_tags`` is None or contains ``el.tag``;
    * the URL scheme is ``http`` or ``https``;
    * the URL's host (lower-cased, without userinfo or port) is in
      ``host_whitelist``.

    A URL that cannot be parsed is refused.
    """
    # urlsplit lives in ``urlparse`` on Python 2 and ``urllib.parse`` on 3.
    try:
        from urlparse import urlsplit
    except ImportError:
        from urllib.parse import urlsplit
    if (self.whitelist_tags is not None
        and el.tag not in self.whitelist_tags):
        return False
    try:
        parts = urlsplit(url)
        # ``hostname`` drops any ``user:password@`` prefix and the port.
        # Cutting the raw netloc at the first ':' would take the userinfo
        # of ``http://good.host:80@evil.host/`` for the host.
        host = parts.hostname
    except ValueError:
        return False
    if parts.scheme not in ('http', 'https'):
        return False
    if not host:
        return False
    return host in self.host_whitelist
395
406
408 bad = []
409 for el in doc.iter(iterate):
410 if condition(el):
411 bad.append(el)
412 for el in bad:
413 el.drop_tree()
414
416
def _remove_javascript_link(self, link):
    """
    Link-rewriting callback: return ``''`` for script-scheme links and
    the link unchanged otherwise.
    """
    # Whitespace is stripped first so that spaced-out schemes such as
    # "j a v a s c r i p t:" are still recognised.
    squeezed = _substitute_whitespace('', link)
    if _javascript_scheme_re.search(squeezed) is None:
        return link
    return ''
422
423 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
424
def _has_sneaky_javascript(self, style):
    """
    Report whether ``style`` still hides script after de-obfuscation.

    Depending on the browser, input like ``e x p r e s s i o n(...)`` or
    ``expre/* stuff */ssion(...)`` can get interpreted.  This strips CSS
    comments, backslashes and whitespace, lower-cases the rest and then
    looks for ``javascript:`` or ``expression(``.

    Plain occurrences are removed by other rules; callers typically
    discard the whole style when this returns True.
    """
    flattened = self._substitute_comments('', style).replace('\\', '')
    flattened = _substitute_whitespace('', flattened).lower()
    for marker in ('javascript:', 'expression('):
        if marker in flattened:
            return True
    return False
445
def clean_html(self, html):
    """
    Return a cleaned version of ``html`` without touching the input.

    A string is parsed, cleaned and serialized back to a string; any
    other object is deep-copied, and the cleaned copy is returned.
    """
    given_markup = isinstance(html, basestring)
    if given_markup:
        tree = fromstring(html)
    else:
        tree = copy.deepcopy(html)
    self(tree)
    if not given_markup:
        return tree
    return tostring(tree)
458
459 clean = Cleaner()
460 clean_html = clean.clean_html
461
462
463
464
465
# Default patterns used for autolinking.  Each one defines a ``body`` group
# (the text shown for the link) and a ``host`` group (checked against the
# avoid-hosts patterns).
_http_link_re = re.compile(
    r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)',
    re.IGNORECASE)
_mailto_link_re = re.compile(
    r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))',
    re.IGNORECASE)
_link_regexes = [_http_link_re, _mailto_link_re]

# Tags whose text is never autolinked.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Hosts that are never linked to: localhost*, example.com/.org/.net
# (including subdomains) and 127.0.0.1.
_avoid_hosts = [
    re.compile(r'^localhost', re.IGNORECASE),
    re.compile(r'\bexample\.(?:com|org|net)$', re.IGNORECASE),
    re.compile(r'^127\.0\.0\.1$'),
]

# CSS classes that switch autolinking off for an element.
_avoid_classes = ['nolink']
481
486 """
487 Turn any URLs into links.
488
489 It will search for links identified by the given regular
490 expressions (by default mailto and http(s) links).
491
492 It won't link text in an element in avoid_elements, or an element
493 with a class in avoid_classes. It won't link to anything with a
494 host that matches one of the regular expressions in avoid_hosts
495 (default localhost, example.com/.org/.net and 127.0.0.1).
496
497 If you pass in an element, the element's tail will not be
498 substituted, only the contents of the element.
499 """
500 if el.tag in avoid_elements:
501 return
502 class_name = el.get('class')
503 if class_name:
504 class_name = class_name.split()
505 for match_class in avoid_classes:
506 if match_class in class_name:
507 return
508 for child in list(el):
509 autolink(child, link_regexes=link_regexes,
510 avoid_elements=avoid_elements,
511 avoid_hosts=avoid_hosts,
512 avoid_classes=avoid_classes)
513 if child.tail:
514 text, tail_children = _link_text(
515 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
516 if tail_children:
517 child.tail = text
518 index = el.index(child)
519 el[index+1:index+1] = tail_children
520 if el.text:
521 text, pre_children = _link_text(
522 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
523 if pre_children:
524 el.text = text
525 el[:0] = pre_children
526
def _link_text(text, link_regexes, avoid_hosts, factory):
    """
    Split ``text`` into leading text plus a list of ``<a>`` elements.

    ``link_regexes`` are compiled patterns that must each define the named
    groups ``host`` and ``body``.  A match whose host is matched by any
    pattern in ``avoid_hosts`` is skipped.  ``factory(tag)`` creates the
    anchor elements.

    Returns ``(leading_text, links)``: the text before the first link,
    and the anchors in order.  Each anchor has ``href`` set to the matched
    link, ``text`` set to the ``body`` group and ``tail`` set to the text
    that follows it.  Without any link the result is ``(text, [])``.
    """
    leading_text = ''
    links = []
    while True:
        # Pick the earliest acceptable match among all patterns.
        best_match = None
        for regex in link_regexes:
            search_from = 0
            while True:
                match = regex.search(text, search_from)
                if match is None:
                    break
                host = match.group('host')
                for host_regex in avoid_hosts:
                    if host_regex.search(host):
                        # Host is off-limits: try this pattern's next hit.
                        search_from = match.end()
                        break
                else:
                    break
            if match is None:
                continue
            if best_match is None or match.start() < best_match.start():
                best_match = match
        if best_match is None:
            # No more links: whatever is left trails the last anchor.
            if links:
                assert not links[-1].tail
                links[-1].tail = text
            else:
                assert not leading_text
                leading_text = text
            break
        link = best_match.group(0)
        end = best_match.end()
        # Sentence punctuation right after a link is not part of it.  All
        # trailing '.'/',' go back into the text, not just one, unless
        # that would leave nothing: the scan must always move forward.
        trimmed = link.rstrip('.,')
        if trimmed:
            end -= len(link) - len(trimmed)
            link = trimmed
        prev_text = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = prev_text
        else:
            assert not leading_text
            leading_text = prev_text
        anchor = factory('a')
        anchor.set('href', link)
        body = best_match.group('body')
        if not body:
            body = link
        body = body.rstrip('.,') or body
        anchor.text = body
        links.append(anchor)
        # Continue after the link; later matches are relative to the rest.
        text = text[end:]
    return leading_text, links
584
def autolink_html(html, *args, **kw):
    # String input is parsed and the result serialized back to a string;
    # any other input is deep-copied so the caller's tree is not modified.
    # Extra arguments are passed straight through to autolink().
    result_as_text = isinstance(html, basestring)
    if result_as_text:
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    autolink(doc, *args, **kw)
    if not result_as_text:
        return doc
    return tostring(doc)
597
598 autolink_html.__doc__ = autolink.__doc__
599
600
601
602
603
604 _avoid_word_break_elements = ['pre', 'textarea', 'code']
605 _avoid_word_break_classes = ['nobreak']
606
611 """
612 Breaks any long words found in the body of the text (not attributes).
613
614 Doesn't affect any of the tags in avoid_elements, by default
615 ``<pre>``, ``<textarea>`` and ``<code>``
616
617 Breaks words by inserting &#8203;, which is a unicode character
618 for Zero Width Space character. This generally takes up no space
619 in rendering, but does copy as a space, and in monospace contexts
620 usually takes up space.
621
622 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
623 """
624
625
626 if el.tag in _avoid_word_break_elements:
627 return
628 class_name = el.get('class')
629 if class_name:
630 dont_break = False
631 class_name = class_name.split()
632 for avoid in avoid_classes:
633 if avoid in class_name:
634 dont_break = True
635 break
636 if dont_break:
637 return
638 if el.text:
639 el.text = _break_text(el.text, max_width, break_character)
640 for child in el:
641 word_break(child, max_width=max_width,
642 avoid_elements=avoid_elements,
643 avoid_classes=avoid_classes,
644 break_character=break_character)
645 if child.tail:
646 child.tail = _break_text(child.tail, max_width, break_character)
647
652
653 -def _break_text(text, max_width, break_character):
654 words = text.split()
655 for word in words:
656 if len(word) > max_width:
657 replacement = _insert_break(word, max_width, break_character)
658 text = text.replace(word, replacement)
659 return text
660
661 _break_prefer_re = re.compile(r'[^a-z]', re.I)
662
664 orig_word = word
665 result = ''
666 while len(word) > width:
667 start = word[:width]
668 breaks = list(_break_prefer_re.finditer(start))
669 if breaks:
670 last_break = breaks[-1]
671
672 if last_break.end() > width-10:
673
674
675 start = word[:last_break.end()]
676 result += start + break_character
677 word = word[len(start):]
678 result += word
679 return result
680