1 import difflib
2 from lxml import etree
3 from lxml.html import fragment_fromstring
4 import cgi
5 import re
6
7 __all__ = ['html_annotate', 'htmldiff']
8
# Python 2 / Python 3 compatibility shims.
try:
    # Python 2: native ``unicode`` type.
    _unicode = unicode
except NameError:
    # Python 3: ``unicode`` is gone, ``str`` is the text type.
    _unicode = str
try:
    # Python 2: fetch ``basestring`` (``__builtins__`` is a dict in
    # imported modules).
    basestring = __builtins__["basestring"]
except (KeyError, NameError):
    # Python 3: ``basestring`` does not exist; fall back to ``str``.
    basestring = str
19
20
21
22
23
def default_markup(text, version):
    """Wrap *text* in a <span> whose title attribute carries *version*.

    The version is HTML-escaped with quote=True so it is safe inside
    the double-quoted attribute; *text* is emitted unmodified.
    """
    escaped_version = cgi.escape(_unicode(version), 1)
    return '<span title="%s">%s</span>' % (escaped_version, text)
27
def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # Tokenize every version, stamping each token with its annotation.
    token_lists = [tokenize_annotated(doc, version)
                   for doc, version in doclist]
    # Walk forward through history, carrying annotations from each
    # older version onto matching tokens of the next newer one.
    merged = token_lists[0]
    for newer in token_lists[1:]:
        html_annotate_merge_annotations(merged, newer)
        merged = newer
    # Collapse adjacent tokens that share an annotation, then render.
    merged = compress_tokens(merged)
    return ''.join(markup_serialize_tokens(merged, markup)).strip()
67
def tokenize_annotated(doc, annotation):
    """Tokenize *doc* and stamp every resulting token with *annotation*."""
    result = tokenize(doc, include_hrefs=False)
    for word in result:
        word.annotation = annotation
    return result
75
def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    matcher = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    for op, i1, i2, j1, j2 in matcher.get_opcodes():
        if op == 'equal':
            # Tokens unchanged between versions keep the older annotation.
            copy_annotations(tokens_old[i1:i2], tokens_new[j1:j2])
88
def copy_annotations(src, dest):
    """Copy the ``annotation`` attribute from each token in *src* onto
    the corresponding token in *dest* (equal-length sequences)."""
    assert len(src) == len(dest)
    for source_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = source_tok.annotation
96
def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        # Mergeable only when no markup separates the two tokens and
        # they carry the same annotation.
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
111
def compress_merge_back(tokens, tok):
    """Fold *tok* into ``tokens[-1]`` in place.

    Only plain ``token`` instances can be textually concatenated;
    subclasses (img/href tokens) carry extra state and are appended
    unmerged instead.
    """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
        return
    text = _unicode(last)
    if last.trailing_whitespace:
        text += ' '
    text += tok
    combined = token(text,
                     pre_tags=last.pre_tags,
                     post_tags=tok.post_tags,
                     trailing_whitespace=tok.trailing_whitespace)
    combined.annotation = last.annotation
    tokens[-1] = combined
129
def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for tok in tokens:
        for pre in tok.pre_tags:
            yield pre
        # markup_func receives plain text (never markup) plus the
        # token's annotation and returns the HTML for that span.
        html = markup_func(tok.html(), tok.annotation)
        if tok.trailing_whitespace:
            html += ' '
        yield html
        for post in tok.post_tags:
            yield post
145
146
147
148
149
150
152
153
def htmldiff(old_html, new_html):
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_tokens = tokenize(old_html)
    new_tokens = tokenize(new_html)
    diffed = ''.join(htmldiff_tokens(old_tokens, new_tokens)).strip()
    # Re-balance the <ins>/<del> markers so the result is valid HTML.
    return fixup_ins_del_tags(diffed)
173
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    matcher = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    result = []
    for command, i1, i2, j1, j2 in matcher.get_opcodes():
        if command == 'equal':
            # Unchanged text: emit the *new* document's version verbatim.
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        # 'replace' produces both an insert and a delete.
        if command in ('insert', 'replace'):
            merge_insert(expand_tokens(html2_tokens[j1:j2]), result)
        if command in ('delete', 'replace'):
            merge_delete(expand_tokens(html1_tokens[i1:i2]), result)
    # Deletes were only marked with DEL_START/DEL_END placeholders;
    # resolve them into real <del> tags now that the document is whole.
    result = cleanup_delete(result)
    return result
211
def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for tok in tokens:
        for pre in tok.pre_tags:
            yield pre
        # In an 'equal' span, tokens such as href_token are suppressed
        # (hide_when_equal) because they only matter when they changed.
        if not equal or not tok.hide_when_equal:
            suffix = ' ' if tok.trailing_whitespace else ''
            yield tok.html() + suffix
        for post in tok.post_tags:
            yield post
226
def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that. """
    # Tags that open or close inside the insert without their partner
    # cannot live inside <ins>; hoist them out and wrap only the
    # balanced middle.
    dangling_open, balanced, dangling_close = split_unbalanced(ins_chunks)
    doc.extend(dangling_open)
    if doc and not doc[-1].endswith(' '):
        # Guarantee whitespace before the <ins> element.
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # Move any trailing whitespace outside the </ins> tag.
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(dangling_close)
246
247
248
249
254
class NoDeletes(Exception):
    """Signals that no pending delete markers (DEL_START/DEL_END)
    remain in the document."""
def merge_delete(del_chunks, doc):
    """Append *del_chunks* to *doc* wrapped in DEL_START/DEL_END
    markers; cleanup_delete() later turns the markers into real
    <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)
266
def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)"""
    while True:
        # Process one DEL_START/DEL_END pair per iteration until
        # split_delete() finds none left.
        try:
            pre_delete, deleted, post_delete = split_delete(chunks)
        except NoDeletes:
            break
        # Unbalanced tags inside the deleted span cannot live inside
        # <del>; try to shift the split point so they pair up with the
        # surrounding document instead.
        dangling_open, balanced, dangling_close = split_unbalanced(deleted)
        locate_unbalanced_start(dangling_open, pre_delete, post_delete)
        locate_unbalanced_end(dangling_close, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Guarantee whitespace before the <del> element.
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # Move trailing whitespace outside the </del> tag.
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks
305
def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    unmatched_open = []
    unmatched_close = []
    open_stack = []   # (tag name, placeholder index in balanced, chunk)
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        is_end = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            # Void elements never need a matching close tag.
            balanced.append(chunk)
            continue
        if not is_end:
            # Reserve a slot; it is confirmed when the close tag shows up.
            open_stack.append((name, len(balanced), chunk))
            balanced.append(None)
        elif open_stack and open_stack[-1][0] == name:
            # Proper nesting: confirm the reserved start-tag slot.
            balanced.append(chunk)
            name, pos, tag = open_stack.pop()
            balanced[pos] = tag
        elif open_stack:
            # Mis-nested close tag: everything still open becomes
            # unmatched, and this close tag is unmatched too.
            unmatched_open.extend([tag for name, pos, tag in open_stack])
            open_stack = []
            unmatched_close.append(chunk)
        else:
            unmatched_close.append(chunk)
    # Whatever never got closed is unmatched-open.
    unmatched_open.extend([chunk for name, pos, chunk in open_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return unmatched_open, balanced, unmatched_close
345
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        start = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    end = chunks.index(DEL_END)
    return chunks[:start], chunks[start + 1:end], chunks[end + 1:]
357
def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # All unmatched open tags have been accounted for.
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next_chunk = post_delete[0]
        if next_chunk is DEL_START or not next_chunk.startswith('<'):
            # A delete marker or plain text: the point cannot move.
            break
        if next_chunk[1] == '/':
            # An end tag can never match an unmatched *open* tag.
            break
        name = next_chunk.split()[0].strip('<>')
        if name == 'ins':
            # Never move the point into an insert.
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next_chunk)
        if name == finding_name:
            # Found the matching open tag just after the split point:
            # shift it onto the pre side so the dangling tag is covered.
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # A different tag: give up on this (and remaining) tags.
            break
407
def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document. """
    while 1:
        if not unbalanced_end:
            # All unmatched close tags accounted for.
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        prev_chunk = pre_delete[-1]
        if prev_chunk is DEL_END or not prev_chunk.startswith('</'):
            # A delete marker or not an end tag: cannot move the point.
            break
        name = prev_chunk.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Never move the point into an insert or delete.
            break
        if name == finding_name:
            # Matching close tag just before the split point: shift it
            # onto the post side so the dangling close tag is covered.
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # A different tag: give up.
            break
433
class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When true, .html() output is suppressed inside 'equal' diff spans
    # (overridden by href_token).
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
        obj = _unicode.__new__(cls, text)
        # Default the tag lists here rather than in the signature so
        # each token gets its own fresh list.
        obj.pre_tags = pre_tags if pre_tags is not None else []
        obj.post_tags = post_tags if post_tags is not None else []
        obj.trailing_whitespace = trailing_whitespace
        return obj

    def __repr__(self):
        return 'token(%s, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, self.post_tags)

    def html(self):
        # A plain word token renders as its own text.
        return _unicode(self)
475
class tag_token(token):
    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag. """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=False):
        # BUG FIX: the token text previously used the *builtin* ``type``
        # ("%s: %s" % (type, data)), so every tag token compared as
        # "<... 'type'>: data" instead of using its actual tag name.
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag            # element name, e.g. 'img'
        obj.data = data          # distinguishing data, e.g. the src URL
        obj.html_repr = html_repr  # literal markup emitted by .html()
        return obj

    def __repr__(self):
        # BUG FIX: the format labels said post_tags=..., pre_tags=... but
        # the arguments were passed as (pre_tags, post_tags); the argument
        # order now matches the labels.
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.post_tags,
            self.pre_tags,
            self.trailing_whitespace)

    def html(self):
        # Render the original tag markup, not the comparison text.
        return self.html_repr
501
503
class href_token(token):
    """Represents the href of an anchor tag.  Unlike normal words, the
    href is only rendered when it differs between the documents."""

    # Suppress output inside 'equal' spans: an unchanged href is noise.
    hide_when_equal = True

    def html(self):
        return 'Link: %s' % self
511
def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particular bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    # Parse (with cleanup), flatten the tree into tag/word chunks, then
    # group those chunks into token objects.
    body_el = parse_html(html, cleanup=True)
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    return fixup_chunks(chunks)
532
def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # Strip out <body>/<ins>/<del> markup that would confuse the diff.
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)
545
# Patterns used by cleanup_html() to slice out the document body and to
# strip pre-existing <ins>/<del> markup (which would confuse the diff).
_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)


# Matches a single whitespace character at the very end of a string;
# used to decide whether a word keeps its trailing-space marker.
end_whitespace_re = re.compile(r'[ \t\n\r]$')
565
def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    def _split_trailing_space(text):
        # One trailing space on a chunk means "followed by whitespace".
        if text.endswith(' '):
            return text[:-1], True
        return text, False

    tag_accum = []    # start/end tags seen since the last word token
    cur_word = None   # most recently created token
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            # Special chunks: ('img', src, tag_text) and ('href', url).
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = _split_trailing_space(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)
            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum,
                                      trailing_whitespace=True)
                tag_accum = []
                result.append(cur_word)
            continue
        if is_word(chunk):
            text, trailing_whitespace = _split_trailing_space(chunk)
            cur_word = token(text, pre_tags=tag_accum,
                             trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)
        elif is_start_tag(chunk):
            tag_accum.append(chunk)
        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                # An end tag with no pending start tags attaches to the
                # word that precedes it.
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert(0)

    if not result:
        # No words at all: emit a single empty token carrying the tags.
        return [token('', pre_tags=tag_accum)]
    else:
        # Leftover tags trail the final token.
        result[-1].post_tags.extend(tag_accum)

    return result
622
623
624
# Void (self-closing) HTML elements: they never contain content, so the
# tokenizer treats them like words rather than wrapping markup.
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

# Block-level elements, used when re-balancing <ins>/<del> placement.
block_level_tags = (
    'address', 'blockquote', 'center', 'dir', 'div', 'dl',
    'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'isindex', 'menu', 'noframes', 'noscript', 'ol',
    'p', 'pre', 'table', 'ul',
)

# Elements that may directly contain block-level elements.
block_level_container_tags = (
    'dd', 'dt', 'frameset', 'li', 'tbody', 'td', 'tfoot',
    'th', 'thead', 'tr',
)
668
669
def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            # Images are emitted as a special tuple so they can become
            # tag_token objects later.
            yield ('img', el.attrib['src'], start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        # A truly empty void element contributes nothing further.
        return
    for word in split_words(el.text):
        yield cgi.escape(word)
    for child in el:
        # Recurse; children always include their own start/end tags.
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
        # Emit the href as a special diffable chunk.
        yield ('href', el.attrib['href'])
    if not skip_tag:
        yield end_tag(el)
        for word in split_words(el.tail):
            yield cgi.escape(word)
697
def split_words(text):
    """ Splits some text into words. Includes trailing whitespace (one
    space) on each word when appropriate. """
    if not text or not text.strip():
        return []
    pieces = [word + ' ' for word in text.strip().split()]
    if not end_whitespace_re.search(text):
        # The source text did not end in whitespace, so the final word
        # must not carry a trailing-space marker.
        pieces[-1] = pieces[-1][:-1]
    return pieces
707
# Matches a single leading whitespace character; used by end_tag() to
# decide whether an element's tail began with a space.
start_whitespace_re = re.compile(r'^[ \t\n\r]')


def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    # Attribute values are escaped with quote=True so they are safe
    # inside the double quotes.
    attrs = ''.join([' %s="%s"' % (name, cgi.escape(value, True))
                     for name, value in el.attrib.items()])
    return '<%s%s>' % (el.tag, attrs)
717
def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate. """
    # If the tail starts with whitespace, keep one space after the tag.
    extra = ' ' if el.tail and start_whitespace_re.search(el.tail) else ''
    return '</%s>%s' % (el.tag, extra)
726
def is_word(tok):
    """True for plain text chunks (anything that is not markup)."""
    return not tok.startswith('<')
729
def is_end_tag(tok):
    """True for closing-tag chunks such as '</div>'."""
    return tok.startswith('</')
732
def is_start_tag(tok):
    """True for opening-tag chunks such as '<div>' (but not '</div>')."""
    return tok.startswith('<') and not tok.startswith('</')
735
744
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the elements tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if not skip_outer:
        return html
    # Drop everything up to and including the first '>' (the start tag)
    # and everything from the last '<' onward (the end tag).
    html = html[html.find('>') + 1:]
    html = html[:html.rfind('<')]
    return html.strip()
762
772
773
783
816
def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            # No children: the tail joins the text being hoisted up.
            text += el.tail
        else:
            # Otherwise the tail attaches after the last child.
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        # Re-home the accumulated text: either onto the parent's text
        # (when el is first) or onto the preceding sibling's tail.
        previous = parent[index - 1] if index != 0 else None
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    # Finally splice el's children into el's old position.
    parent[index:index + 1] = el.getchildren()
850
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    # Matching blocks of this size or smaller are discarded (scaled
    # down further for very short inputs).
    threshold = 2

    def get_matching_blocks(self):
        # BUG FIX: the size bound previously computed
        # min(len(self.b), len(self.b)), comparing ``b`` against itself
        # and ignoring ``a``; the threshold must be bounded by the
        # shorter of the TWO sequences.
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        # Keep blocks larger than the threshold, plus the zero-length
        # terminator block that SequenceMatcher always appends.
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
866
if __name__ == '__main__':
    # Allow running this module directly as a command-line diff tool.
    from lxml.html import _diffcommand
    _diffcommand.main()
870