from __future__ import absolute_import

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str


def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy is to split the documents up into document
    # tokens (which are words with attached markup) and then join
    # them back together into the new document.  Words are assigned
    # annotations by the sequence matching and copying routines.

    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After all the annotations have been tracked, spans of text that
    # are adjacent and share an annotation are combined...
    cur_tokens = compress_tokens(cur_tokens)
    # ... and finally the markup function is applied.
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
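
# Illustrative sketch (comment only, not executed): if two adjacent tokens
# carry the same annotation and no markup sits between them, they are folded
# into one.  For example, the two tokens produced by
#
#     tokenize_annotated('Hello World', 'v1')
#
# ('Hello' and 'World', both annotated 'v1') come back from compress_tokens
# as a single 'Hello World' token annotated 'v1'.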

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


def htmldiff(old_html, new_html):
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
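
# Illustrative usage (comment only, not executed as part of the module):
#
#     from lxml.html.diff import htmldiff
#     print(htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>'))
#
# prints markup along the lines of
# '<p><ins>Goodbye</ins> <del>Hello</del> World</p>'; the exact whitespace
# and tag placement can vary.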

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt to
    # keep markup from the old document; anything that we can't resolve
    # we throw away.  Also we try to put the deletes as close to the
    # location where we think they would have been -- anything out of
    # place will be moved to the end of the deleted portion.

    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)

    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead, special DEL_START/DEL_END
    # markers were put in; once the whole document has been assembled,
    # cleanup_delete moves them around and resolves them into real tags.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced start/end tags (we assume
    # there is accompanying markup earlier or later in the document),
    # we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't
        # end with a space.
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move the trailing space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until the cleanup phase turns them into proper markup:
class DEL_START:
    pass
class DEL_END:
    pass


class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))"""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up the case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move the trailing space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

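# Illustrative sketch (comment only): merge_delete marks a deletion as, e.g.,
#
#     ['Hello ', DEL_START, 'cruel ', DEL_END, 'world']
#
# and cleanup_delete resolves the markers into real tags, giving
#
#     ['Hello ', '<del>', 'cruel', '</del> ', 'world']
#
# (the trailing space is moved outside the closing </del>).
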
def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end

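# Illustrative sketch (comment only) of how split_unbalanced classifies chunks:
#
#     split_unbalanced(['<p>', 'Hi', '</p>', '</div>'])
#     -> ([], ['<p>', 'Hi', '</p>'], ['</div>'])
#
#     split_unbalanced(['<div>', 'More'])
#     -> (['<div>'], ['More'], [])
#
# The lone </div> and <div> are reported as unbalanced so the caller can
# decide where (or whether) to re-attach them.
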
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # A different tag; stop moving the point
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # A different tag; stop moving the point
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)

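# Illustrative sketch (comment only): a token compares like its plain text
# while carrying its markup along.  For example
#
#     tok = token('Hello', pre_tags=['<b>'], post_tags=['</b>'],
#                 trailing_whitespace=' ')
#
# behaves like the string 'Hello' in the sequence matcher, but remembers the
# surrounding <b>...</b> and the trailing space for serialization.
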
class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-join them into token objects:
    return fixup_chunks(chunks)

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html


end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result
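
# Illustrative sketch (comment only): given the flattened chunks of
# '<p>Hello world</p>', i.e. ['<p>', 'Hello ', 'world', '</p>'], fixup_chunks
# returns two tokens: token('Hello', pre_tags=['<p>'], trailing_whitespace=' ')
# and token('world', post_tags=['</p>']).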


empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words

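# Illustrative sketch (comment only):
#
#     split_words('Hello  cruel world\n')
#     -> ['Hello  ', 'cruel ', 'world\n']
#
# Each word keeps whatever whitespace follows it.
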
start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')


def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Remove the extra starting tag:
        html = html[html.find('>')+1:]
        # Remove the extra ending tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()

def _contains_block_level_tag(el):
    """True if the element contains any block-level tags, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child; wrap all the content in one tag
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                # Move the tail into its own wrapper tag after the child:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            # Wrap the non-block child in the tag:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]

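# Illustrative note: with the default threshold of 2, an "equal" run of only
# one or two tokens inside a large changed region is discarded, so the diff
# reports it as changed rather than producing noisy single-word matches.
# Zero-length blocks are kept because difflib ends its block list with a
# (len(a), len(b), 0) sentinel.
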
if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()