import re
import copy
import urlparse
from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring

try:
    set
except NameError:
    from sets import Set as set

__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# @import can pull in arbitrary external stylesheets:
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution:
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
_whitespace_re = re.compile(r'\s+')

# Matches IE conditional comments, which can smuggle markup past the
# parser:
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

_find_external_links = etree.XPath(
    "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")


def clean_html(html, **kw):
    """
    Like clean(), but takes a text input document, and returns a text
    document.
    """
    doc = fromstring(html)
    clean(doc, **kw)
    return tostring(doc)

class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags.

    ``meta``:
        Removes any ``<meta>`` tags.

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes).

    ``frames``:
        Removes any frame-related tags.

    ``forms``:
        Removes any form tags.

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and
        ``<marquee>``.

    ``remove_tags``:
        A list of tags to remove.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from `feedparser
        <http://feedparser.org/docs/html-sanitization.html>`_).

    ``add_nofollow``:
        If true, then any ``<a>`` tags will have ``rel="nofollow"`` added
        to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])
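
    # A quick sketch of typical use (the option values here are arbitrary
    # examples, not recommendations):
    #
    #     cleaner = Cleaner(style=True, links=True, add_nofollow=True,
    #                       page_structure=False, safe_attrs_only=False)
    #     cleaned_html = cleaner.clean_html(dirty_html)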

    def __init__(self, **kw):
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)

    # Used to look up the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # On <applet> both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # There is no general way to tell which links an <object> uses;
        # they usually go in <param> tags whose meaning depends on the
        # plugin, so <object> is not listed here.
        a='href',
        )

200 """
201 Cleans the document.
202 """
203 if hasattr(doc, 'getroot'):
204
205 doc = doc.getroot()
206
207
208 for el in doc.getiterator('image'):
209 el.tag = 'img'
210 if not self.comments:
211
212
213 self.kill_conditional_comments(doc)
214 kill_tags = set()
215 remove_tags = set(self.remove_tags or ())
216 if self.allow_tags:
217 allow_tags = set(self.allow_tags)
218 else:
219 allow_tags = set()
220 if self.scripts:
221 kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.getiterator():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs handles the event (on*) attributes itself:
                for el in doc.getiterator():
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If styles are being removed entirely (self.style), there
                # is no need to scrub Javascript out of them first:
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.getiterator('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # Imported CSS can do anything, so strip @import as well:
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            for el in _find_styled_elements(doc):
                del el.attrib['style']
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them:
            for el in list(doc.getiterator('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # Get rid of any <param> tags that are not inside an <applet>
            # or <object>; they are not valid anyway:
            for el in list(doc.getiterator('param')):
                found_parent = False
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The content inside these tags is a reasonable fallback, so
            # they are removed (unwrapped) rather than killed:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))
        _remove = []
        _kill = []
        for el in doc.getiterator():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We do not want to remove the root element; unwrap it into a
            # <div> instead:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We do not want to kill the root element either; empty it out
            # instead:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.getiterator():
                if el.tag not in allow_tags:
                    bad.append(el)
            for el in bad:
                el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    el.set('rel', 'nofollow')

363 """
364 Override to suppress rel="nofollow" on some anchors.
365 """
366 return False
367
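    # For example, a subclass could keep rel="nofollow" off links to a
    # trusted site (the host name below is only an illustration):
    #
    #     class MyCleaner(Cleaner):
    #         def allow_follow(self, anchor):
    #             return anchor.get('href', '').startswith('http://example.org/')
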
    def allow_element(self, el):
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
            return False
        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
        netloc = netloc.lower().split(':', 1)[0]
        if scheme not in ('http', 'https'):
            return False
        if netloc in self.host_whitelist:
            return True
        return False

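    # For example, to allow YouTube embeds and nothing else (a sketch; the
    # host list is only an illustration):
    #
    #     cleaner = Cleaner(host_whitelist=['www.youtube.com'])
    #     cleaned = cleaner.clean_html(page_html)
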
    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        bad = []
        for el in doc.getiterator(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    def _remove_javascript_link(self, link):
        # links like "j a v a s c r i p t:" might be interpreted in IE:
        new = _whitespace_re.sub('', link)
        if _javascript_scheme_re.search(new):
            return ''
        return link

    _decomment_re = re.compile(r'/\*.*?\*/', re.S)

428 """
429 Depending on the browser, stuff like ``e x p r e s s i o n(...)``
430 can get interpreted, or ``expre/* stuff */ssion(...)``. This
431 checks for attempt to do stuff like this.
432
433 Typically the response will be to kill the entire style; if you
434 have just a bit of Javascript in the style another rule will catch
435 that and remove only the Javascript from the style; this catches
436 more sneaky attempts.
437 """
438 style = self._decomment_re.sub('', style)
439 style = style.replace('\\', '')
440 style = _whitespace_re.sub('', style)
441 style = style.lower()
442 if 'javascript:' in style:
443 return True
444 if 'expression(' in style:
445 return True
446 return False
447
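    # For instance, both of these style values are rejected (a sketch of
    # the behaviour, not an exhaustive list):
    #
    #     Cleaner()._has_sneaky_javascript('expre/* x */ssion(alert(1))')  # True
    #     Cleaner()._has_sneaky_javascript('j a v a s c r i p t : url()')  # True
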
    def clean_html(self, html):
        if isinstance(html, basestring):
            return_string = True
            doc = fromstring(html)
        else:
            return_string = False
            doc = copy.deepcopy(html)
        self(doc)
        if return_string:
            return tostring(doc)
        else:
            return doc

clean = Cleaner()
clean_html = clean.clean_html

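# A short usage sketch (``page`` is any HTML string you already have):
#
#     cleaned = clean_html(page)                       # module-level default
#     strict = Cleaner(style=True, safe_attrs_only=True)
#     cleaned = strict.clean_html(page)                 # customised cleaning
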

############################################################
## Autolinking
############################################################

_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']

488 """
489 Turn any URLs into links.
490
491 It will search for links identified by the given regular
492 expressions (by default mailto and http(s) links).
493
494 It won't link text in an element in avoid_elements, or an element
495 with a class in avoid_classes. It won't link to anything with a
496 host that matches one of the regular expressions in avoid_hosts
497 (default localhost and 127.0.0.1).
498
499 If you pass in an element, the elements tail will not be
500 substituted, only the contents of the element.
501 """
502 if el.tag in avoid_elements:
503 return
504 class_name = el.get('class')
505 if class_name:
506 class_name = class_name.split()
507 for match_class in avoid_classes:
508 if match_class in class_name:
509 return
510 for child in list(el):
511 autolink(child, link_regexes=link_regexes,
512 avoid_elements=avoid_elements,
513 avoid_hosts=avoid_hosts,
514 avoid_classes=avoid_classes)
515 if child.tail:
516 text, tail_children = _link_text(
517 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
518 if tail_children:
519 child.tail = text
520 index = el.index(child)
521 el[index+1:index+1] = tail_children
522 if el.text:
523 text, pre_children = _link_text(
524 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
525 if pre_children:
526 el.text = text
527 el[:0] = pre_children
528
def _link_text(text, link_regexes, avoid_hosts, factory):
    leading_text = ''
    links = []
    last_pos = 0
    while 1:
        best_match, best_pos = None, None
        for regex in link_regexes:
            regex_pos = last_pos
            while 1:
                match = regex.search(text, pos=regex_pos)
                if match is None:
                    break
                host = match.group('host')
                for host_regex in avoid_hosts:
                    if host_regex.search(host):
                        regex_pos = match.end()
                        break
                else:
                    break
            if match is None:
                continue
            if best_pos is None or match.start() < best_pos:
                best_match = match
                best_pos = match.start()
        if best_match is None:
            # No more matches
            if links:
                assert not links[-1].tail
                links[-1].tail = text
            else:
                assert not leading_text
                leading_text = text
            break
        link = best_match.group(0)
        end = best_match.end()
        if link.endswith('.') or link.endswith(','):
            # A trailing period or comma is rarely part of the link itself:
            end -= 1
            link = link[:-1]
        prev_text = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = prev_text
        else:
            assert not leading_text
            leading_text = prev_text
        anchor = factory('a')
        anchor.set('href', link)
        body = best_match.group('body')
        if not body:
            body = link
        if body.endswith('.') or body.endswith(','):
            body = body[:-1]
        anchor.text = body
        links.append(anchor)
        text = text[end:]
    return leading_text, links

def autolink_html(html, *args, **kw):
    if isinstance(html, basestring):
        doc = fromstring(html)
        return_string = True
    else:
        doc = copy.deepcopy(html)
        return_string = False
    autolink(doc, *args, **kw)
    if return_string:
        return tostring(doc)
    else:
        return doc

autolink_html.__doc__ = autolink.__doc__

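# For example (a sketch; the input string is made up):
#
#     autolink_html('<p>See http://lxml.de for details</p>')
#     # -> '<p>See <a href="http://lxml.de">http://lxml.de</a> for details</p>'
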

############################################################
## Word breaking
############################################################

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']

613 """
614 Breaks any long words found in the body of the text (not attributes).
615
616 Doesn't effect any of the tags in avoid_elements, by default
617 ``<textarea>`` and ``<pre>``
618
619 Breaks words by inserting ​, which is a unicode character
620 for Zero Width Space character. This generally takes up no space
621 in rendering, but does copy as a space, and in monospace contexts
622 usually takes up space.
623
624 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
625 """
626
627
628 if el.tag in _avoid_word_break_elements:
629 return
630 class_name = el.get('class')
631 if class_name:
632 dont_break = False
633 class_name = class_name.split()
634 for avoid in avoid_classes:
635 if avoid in class_name:
636 dont_break = True
637 break
638 if dont_break:
639 return
640 if el.text:
641 el.text = _break_text(el.text, max_width, break_character)
642 for child in el:
643 word_break(child, max_width=max_width,
644 avoid_elements=avoid_elements,
645 avoid_classes=avoid_classes,
646 break_character=break_character)
647 if child.tail:
648 child.tail = _break_text(child.tail, max_width, break_character)
def word_break_html(html, *args, **kw):
    doc = fromstring(html)
    word_break(doc, *args, **kw)
    return tostring(doc)

def _break_text(text, max_width, break_character):
    words = text.split()
    for word in words:
        if len(word) > max_width:
            replacement = _insert_break(word, max_width, break_character)
            text = text.replace(word, replacement)
    return text

_break_prefer_re = re.compile(r'[^a-z]', re.I)

def _insert_break(word, width, break_character):
    orig_word = word
    result = ''
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nicer break point:
            if last_break.end() > width-10:
                start = word[:last_break.end()]
        result += start + break_character
        word = word[len(start):]
    result += word
    return result
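
# A short usage sketch (the 40-character threshold is just the default
# ``max_width``):
#
#     word_break_html('<p>%s</p>' % ('x' * 100))
#     # inserts U+200B roughly every 40 characters inside the long word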