1 import re
2 import copy
3 import urlparse
4 from lxml import etree
5 from lxml.html import defs
6 from lxml.html import fromstring, tostring
7
8 try:
9 set
10 except NameError:
11 from sets import Set as set
12
13 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
14 'word_break', 'word_break_html']
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Matches a CSS ``expression(...)`` construct.  Case-insensitive, and
# re.S lets the (non-greedy) argument span several lines.
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Matches the start of a CSS ``@import`` rule; whitespace is tolerated
# between the ``@`` and the keyword.
_css_import_re = re.compile(
    r'@\s*import', re.I)

# Optional whitespace followed by one of the listed URL schemes and a
# colon (case-insensitive).
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
# Bound ``sub`` of a pattern matching runs of whitespace; called as
# ``_substitute_whitespace(replacement, string)``.
_substitute_whitespace = re.compile(r'\s+').sub

# Matches text of the form ``[if ...]>``.
# NOTE(review): judging by the name this targets the opening of IE
# conditional comments -- confirm against kill_conditional_comments.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# Selects the context element and every descendant that carries a
# ``style`` attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Selects ``<a>`` elements (context element included) whose
# whitespace-normalized ``href`` is non-empty and does not start with '#'.
_find_external_links = etree.XPath(
    "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
60
62 """
63 Instances clean the document of each of the possible offending
64 elements. The cleaning is controlled by attributes; you can
65 override attributes in a subclass, or set them in the constructor.
66
67 ``scripts``:
68 Removes any ``<script>`` tags.
69
70 ``javascript``:
71 Removes any Javascript, like an ``onclick`` attribute.
72
73 ``comments``:
74 Removes any comments.
75
76 ``style``:
77 Removes any style tags or attributes.
78
79 ``links``:
80 Removes any ``<link>`` tags
81
82 ``meta``:
83 Removes any ``<meta>`` tags
84
85 ``page_structure``:
86 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
87
88 ``processing_instructions``:
89 Removes any processing instructions.
90
91 ``embedded``:
92 Removes any embedded objects (flash, iframes)
93
94 ``frames``:
95 Removes any frame-related tags
96
97 ``forms``:
98 Removes any form tags
99
100 ``annoying_tags``:
101 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marque>``
102
103 ``remove_tags``:
104 A list of tags to remove.
105
106 ``allow_tags``:
107 A list of tags to include (default include all).
108
109 ``remove_unknown_tags``:
110 Remove any tags that aren't standard parts of HTML.
111
112 ``safe_attrs_only``:
113 If true, only include 'safe' attributes (specifically the list
114 from `feedparser
115 <http://feedparser.org/docs/html-sanitization.html>`_).
116
117 ``add_nofollow``:
118 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
119
120 ``host_whitelist``:
121 A list or set of hosts that you can use for embedded content
122 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
123 You can also implement/override the method
124 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
125 implement more complex rules for what can be embedded.
126 Anything that passes this test will be shown, regardless of
127 the value of (for instance) ``embedded``.
128
129 Note that this parameter might not work as intended if you do not
130 make the links absolute before doing the cleaning.
131
132 ``whitelist_tags``:
133 A set of tags that can be included with ``host_whitelist``.
134 The default is ``iframe`` and ``embed``; you may wish to
135 include other tags like ``script``, or you may want to
136 implement ``allow_embedded_url`` for more control. Set to None to
137 include all tags.
138
139 This modifies the document *in place*.
140 """
141
142 scripts = True
143 javascript = True
144 comments = True
145 style = False
146 links = True
147 meta = True
148 page_structure = True
149 processing_instructions = True
150 embedded = True
151 frames = True
152 forms = True
153 annoying_tags = True
154 remove_tags = None
155 allow_tags = None
156 remove_unknown_tags = True
157 safe_attrs_only = True
158 add_nofollow = False
159 host_whitelist = ()
160 whitelist_tags = set(['iframe', 'embed'])
161
168
169
170
171 _tag_link_attrs = dict(
172 script='src',
173 link='href',
174
175
176 applet=['code', 'object'],
177 iframe='src',
178 embed='src',
179 layer='src',
180
181
182
183
184
185
186
187
188 a='href',
189 )
190
192 """
193 Cleans the document.
194 """
195 if hasattr(doc, 'getroot'):
196
197 doc = doc.getroot()
198
199
200 for el in doc.iter('image'):
201 el.tag = 'img'
202 if not self.comments:
203
204
205 self.kill_conditional_comments(doc)
206 kill_tags = set()
207 remove_tags = set(self.remove_tags or ())
208 if self.allow_tags:
209 allow_tags = set(self.allow_tags)
210 else:
211 allow_tags = set()
212 if self.scripts:
213 kill_tags.add('script')
214 if self.safe_attrs_only:
215 safe_attrs = set(defs.safe_attrs)
216 for el in doc.iter():
217 attrib = el.attrib
218 for aname in attrib.keys():
219 if aname not in safe_attrs:
220 del attrib[aname]
221 if self.javascript:
222 if not self.safe_attrs_only:
223
224 for el in doc.iter():
225 attrib = el.attrib
226 for aname in attrib.keys():
227 if aname.startswith('on'):
228 del attrib[aname]
229 doc.rewrite_links(self._remove_javascript_link,
230 resolve_base_href=False)
231 if not self.style:
232
233
234 for el in _find_styled_elements(doc):
235 old = el.get('style')
236 new = _css_javascript_re.sub('', old)
237 new = _css_import_re.sub('', old)
238 if self._has_sneaky_javascript(new):
239
240 del el.attrib['style']
241 elif new != old:
242 el.set('style', new)
243 for el in list(doc.iter('style')):
244 if el.get('type', '').lower().strip() == 'text/javascript':
245 el.drop_tree()
246 continue
247 old = el.text or ''
248 new = _css_javascript_re.sub('', old)
249
250 new = _css_import_re.sub('', old)
251 if self._has_sneaky_javascript(new):
252
253 el.text = '/* deleted */'
254 elif new != old:
255 el.text = new
256 if self.comments or self.processing_instructions:
257
258
259
260 kill_tags.add(etree.Comment)
261 if self.processing_instructions:
262 kill_tags.add(etree.ProcessingInstruction)
263 if self.style:
264 kill_tags.add('style')
265 for el in _find_styled_elements(doc):
266 del el.attrib['style']
267 if self.links:
268 kill_tags.add('link')
269 elif self.style or self.javascript:
270
271
272 for el in list(doc.iter('link')):
273 if 'stylesheet' in el.get('rel', '').lower():
274
275 el.drop_tree()
276 if self.meta:
277 kill_tags.add('meta')
278 if self.page_structure:
279 remove_tags.update(('head', 'html', 'title'))
280 if self.embedded:
281
282
283
284 for el in list(doc.iter('param')):
285 found_parent = False
286 parent = el.getparent()
287 while parent is not None and parent.tag not in ('applet', 'object'):
288 parent = parent.getparent()
289 if parent is None:
290 el.drop_tree()
291 kill_tags.update(('applet',))
292
293 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
294 if self.frames:
295
296
297
298 kill_tags.update(defs.frame_tags)
299 if self.forms:
300 remove_tags.add('form')
301 kill_tags.update(('button', 'input', 'select', 'textarea'))
302 if self.annoying_tags:
303 remove_tags.update(('blink', 'marque'))
304
305 _remove = []
306 _kill = []
307 for el in doc.iter():
308 if el.tag in kill_tags:
309 if self.allow_element(el):
310 continue
311 _kill.append(el)
312 elif el.tag in remove_tags:
313 if self.allow_element(el):
314 continue
315 _remove.append(el)
316
317 if _remove and _remove[0] == doc:
318
319
320 el = _remove.pop(0)
321 el.tag = 'div'
322 el.attrib.clear()
323 elif _kill and _kill[0] == doc:
324
325
326 el = _kill.pop(0)
327 if el.tag != 'html':
328 el.tag = 'div'
329 el.clear()
330
331 for el in _kill:
332 el.drop_tree()
333 for el in _remove:
334 el.drop_tag()
335
336 allow_tags = self.allow_tags
337 if self.remove_unknown_tags:
338 if allow_tags:
339 raise ValueError(
340 "It does not make sense to pass in both allow_tags and remove_unknown_tags")
341 allow_tags = set(defs.tags)
342 if allow_tags:
343 bad = []
344 for el in doc.iter():
345 if el.tag not in allow_tags:
346 bad.append(el)
347 for el in bad:
348 el.drop_tag()
349 if self.add_nofollow:
350 for el in _find_external_links(doc):
351 if not self.allow_follow(el):
352 el.set('rel', 'nofollow')
353
355 """
356 Override to suppress rel="nofollow" on some anchors.
357 """
358 return False
359
361 if el.tag not in self._tag_link_attrs:
362 return False
363 attr = self._tag_link_attrs[el.tag]
364 if isinstance(attr, (list, tuple)):
365 for one_attr in attr:
366 url = el.get(one_attr)
367 if not url:
368 return False
369 if not self.allow_embedded_url(el, url):
370 return False
371 return True
372 else:
373 url = el.get(attr)
374 if not url:
375 return False
376 return self.allow_embedded_url(el, url)
377
379 if (self.whitelist_tags is not None
380 and el.tag not in self.whitelist_tags):
381 return False
382 scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
383 netloc = netloc.lower().split(':', 1)[0]
384 if scheme not in ('http', 'https'):
385 return False
386 if netloc in self.host_whitelist:
387 return True
388 return False
389
400
402 bad = []
403 for el in doc.iter(iterate):
404 if condition(el):
405 bad.append(el)
406 for el in bad:
407 el.drop_tree()
408
416
417 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
418
420 """
421 Depending on the browser, stuff like ``e x p r e s s i o n(...)``
422 can get interpreted, or ``expre/* stuff */ssion(...)``. This
423 checks for attempts to do stuff like this.
424
425 Typically the response will be to kill the entire style; if you
426 have just a bit of Javascript in the style another rule will catch
427 that and remove only the Javascript from the style; this catches
428 more sneaky attempts.
429 """
430 style = self._substitute_comments('', style)
431 style = style.replace('\\', '')
432 style = _substitute_whitespace('', style)
433 style = style.lower()
434 if 'javascript:' in style:
435 return True
436 if 'expression(' in style:
437 return True
438 return False
439
441 if isinstance(html, basestring):
442 return_string = True
443 doc = fromstring(html)
444 else:
445 return_string = False
446 doc = copy.deepcopy(html)
447 self(doc)
448 if return_string:
449 return tostring(doc)
450 else:
451 return doc
452
# Module-level Cleaner instance built with no constructor arguments, plus
# a module-level alias for its bound ``clean_html`` method.
clean = Cleaner()
clean_html = clean.clean_html
455
456
457
458
459
# Patterns recognised as links.  _link_text() reads two named groups from
# every match: ``host`` (checked against the avoid-host patterns) and
# ``body`` (used as the visible text of the generated anchor).
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    # The host class used to read ``[a-z0-9_._]`` -- ``_`` twice and no
    # ``-`` -- so a hyphenated host such as ``my-host.org`` was cut off at
    # the hyphen.  Hyphens are now accepted, matching the http pattern.
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
    ]

# Tag names whose text is never turned into links.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Host patterns that are never linked: anything starting with
# "localhost", the reserved example.com/.org/.net domains, and the IPv4
# loopback address.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

# Class names that exempt an element from autolinking.
_avoid_classes = ['nolink']
475
480 """
481 Turn any URLs into links.
482
483 It will search for links identified by the given regular
484 expressions (by default mailto and http(s) links).
485
486 It won't link text in an element in avoid_elements, or an element
487 with a class in avoid_classes. It won't link to anything with a
488 host that matches one of the regular expressions in avoid_hosts
489 (default localhost and 127.0.0.1).
490
491 If you pass in an element, the element's tail will not be
492 substituted, only the contents of the element.
493 """
494 if el.tag in avoid_elements:
495 return
496 class_name = el.get('class')
497 if class_name:
498 class_name = class_name.split()
499 for match_class in avoid_classes:
500 if match_class in class_name:
501 return
502 for child in list(el):
503 autolink(child, link_regexes=link_regexes,
504 avoid_elements=avoid_elements,
505 avoid_hosts=avoid_hosts,
506 avoid_classes=avoid_classes)
507 if child.tail:
508 text, tail_children = _link_text(
509 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
510 if tail_children:
511 child.tail = text
512 index = el.index(child)
513 el[index+1:index+1] = tail_children
514 if el.text:
515 text, pre_children = _link_text(
516 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
517 if pre_children:
518 el.text = text
519 el[:0] = pre_children
520
def _search_allowed_host(regex, text, avoid_hosts):
    """Return the first match of ``regex`` in ``text`` whose ``host`` group
    is not matched by any pattern in ``avoid_hosts``, or None if there is
    no such match."""
    pos = 0
    while True:
        match = regex.search(text, pos)
        if match is None:
            return None
        host = match.group('host')
        for host_regex in avoid_hosts:
            if host_regex.search(host):
                break
        else:
            # No avoid pattern fired: this match is usable.
            return match
        # Host is on the avoid list; resume searching after this match.
        pos = match.end()


def _link_text(text, link_regexes, avoid_hosts, factory):
    """
    Split ``text`` into plain text and anchor elements.

    ``link_regexes`` are compiled patterns that must define the named
    groups ``host`` and ``body``.  ``avoid_hosts`` are compiled patterns;
    a match whose ``host`` group is found by any of them is left as plain
    text.  ``factory`` is called as ``factory('a')`` to create each anchor.

    Returns ``(leading_text, links)``: the text preceding the first link
    and the list of created anchors.  Each anchor has its ``href`` set to
    the matched text, ``.text`` set to the ``body`` group (or the whole
    match if that group is empty) and ``.tail`` set to the plain text that
    follows it.  A single trailing ``.`` or ``,`` is left out of the link
    and stays in the surrounding text.
    """
    leading_text = ''
    links = []
    while True:
        # Of the first usable match of every pattern, take the one that
        # starts earliest; on a tie the pattern listed first wins.
        best_match = None
        for regex in link_regexes:
            match = _search_allowed_host(regex, text, avoid_hosts)
            if match is None:
                continue
            if best_match is None or match.start() < best_match.start():
                best_match = match

        # Plain text to emit before the next link, or everything that is
        # left when there are no more links.
        if best_match is None:
            pending = text
        else:
            pending = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = pending
        else:
            assert not leading_text
            leading_text = pending
        if best_match is None:
            break

        link = best_match.group(0)
        end = best_match.end()
        if link[-1:] in ('.', ','):
            # These punctuation marks shouldn't end a link; leave the
            # character in the text that follows.
            end -= 1
            link = link[:-1]
        anchor = factory('a')
        anchor.set('href', link)
        body = best_match.group('body') or link
        if body[-1:] in ('.', ','):
            body = body[:-1]
        anchor.text = body
        links.append(anchor)
        # Continue with whatever follows the link just consumed.
        text = text[end:]
    return leading_text, links
578
580 if isinstance(html, basestring):
581 doc = fromstring(html)
582 return_string = True
583 else:
584 doc = copy.deepcopy(html)
585 return_string = False
586 autolink(doc, *args, **kw)
587 if return_string:
588 return tostring(doc)
589 else:
590 return doc
591
# autolink_html shares autolink's docstring.
autolink_html.__doc__ = autolink.__doc__

# Tag names whose content is never word-broken.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
# Class names that opt an element out of word breaking.
# NOTE(review): presumably the default for the ``avoid_classes`` argument
# of word_break -- confirm against its signature.
_avoid_word_break_classes = ['nobreak']
600
605 """
606 Breaks any long words found in the body of the text (not attributes).
607
608 Doesn't affect any of the tags in avoid_elements, by default
609 ``<textarea>`` and ``<pre>``
610
611 Breaks words by inserting &#8203;, which is the character reference
612 for the Unicode Zero Width Space character. This generally takes up no space
613 in rendering, but does copy as a space, and in monospace contexts
614 usually takes up space.
615
616 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
617 """
618
619
620 if el.tag in _avoid_word_break_elements:
621 return
622 class_name = el.get('class')
623 if class_name:
624 dont_break = False
625 class_name = class_name.split()
626 for avoid in avoid_classes:
627 if avoid in class_name:
628 dont_break = True
629 break
630 if dont_break:
631 return
632 if el.text:
633 el.text = _break_text(el.text, max_width, break_character)
634 for child in el:
635 word_break(child, max_width=max_width,
636 avoid_elements=avoid_elements,
637 avoid_classes=avoid_classes,
638 break_character=break_character)
639 if child.tail:
640 child.tail = _break_text(child.tail, max_width, break_character)
641
646
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with every whitespace-delimited word longer than
    ``max_width`` replaced by the result of
    ``_insert_break(word, max_width, break_character)``.

    All whitespace between words is preserved exactly.  Each word is
    rewritten in place exactly once.  The previous implementation used
    ``text.replace(word, ...)``, which also rewrote the word wherever it
    occurred inside other, longer words; those words were then broken at
    the wrong offsets or left longer than ``max_width``.
    """
    pieces = []
    pos = 0
    for word in text.split():
        # Only whitespace lies between ``pos`` and the next word, so the
        # first hit at or after ``pos`` is that word's real position.
        start = text.index(word, pos)
        pieces.append(text[pos:start])
        if len(word) > max_width:
            word_out = _insert_break(word, max_width, break_character)
        else:
            word_out = word
        pieces.append(word_out)
        pos = start + len(word)
    # Trailing whitespace (or the whole text when it contains no words).
    pieces.append(text[pos:])
    return ''.join(pieces)
654
# Matches any single character other than an ASCII letter
# (case-insensitive).  The word-breaking code below scans the leading
# slice of a long word with it and uses the last hit as a candidate break
# position.
_break_prefer_re = re.compile(r'[^a-z]', re.I)
656
658 orig_word = word
659 result = ''
660 while len(word) > width:
661 start = word[:width]
662 breaks = list(_break_prefer_re.finditer(start))
663 if breaks:
664 last_break = breaks[-1]
665
666 if last_break.end() > width-10:
667
668
669 start = word[:last_break.end()]
670 result += start + break_character
671 word = word[len(start):]
672 result += word
673 return result
674