import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy: split each document into logical tokens
    # (words with their attached markup), then walk through the
    # versions in order, copying annotations forward from each older
    # document into the tokens of the next newer document wherever the
    # two documents share an unchanged run of tokens.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After the annotations have been tracked through every version,
    # adjacent tokens that share an annotation are combined.
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add the markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()
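
# Example (illustrative): any callable taking (text, version) and returning
# HTML can be passed as the markup argument.  A hypothetical variant that
# emits a CSS class instead of a title attribute:
#
#     def css_markup(text, version):
#         return '<span class="annotated" data-version="%s">%s</span>' % (
#             html_escape(_unicode(version), 1), text)
#
#     html_annotate([('Hello World', 'v1'), ('Goodbye World', 'v2')],
#                   markup=css_markup)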

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
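
# Example (illustrative): two adjacent tokens with no markup between them
# and the same annotation collapse into a single token:
#
#     t1 = token('Hello', trailing_whitespace=' ')
#     t1.annotation = 'v1'
#     t2 = token('World')
#     t2.annotation = 'v1'
#     compress_tokens([t1, t2])   # -> [token('Hello World')] annotated 'v1'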

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
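
# Example usage (illustrative; the exact whitespace in the output may vary):
#
#     >>> htmldiff('<p>Hello</p>', '<p>Hello there</p>')
#     '<p>Hello <ins>there</ins> </p>'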

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best-effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  We also try to put the deletes as close
    # as possible to the location where we think they would have
    # appeared, keeping the deleted chunks together even though the two
    # documents can be quite different.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)

    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and after the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away the unbalanced start/end tags, we only
    # wrap <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end
        # with a space.
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move the space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)
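
# Example (illustrative): inserting a balanced chunk into an
# already-processed document:
#
#     doc = ['<p>', 'Hello ']
#     merge_insert(['there '], doc)
#     # doc is now ['<p>', 'Hello ', '<ins>', 'there', '</ins> ']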

# These are sentinels to represent the start and end of a <del>
# segment, until the cleanup phase turns them into proper markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with a marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)"""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END:
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well-balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the delete point forward and/or backward based
        # on these unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up the case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move the space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end
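
# Example (illustrative): a span that opens <b> without closing it and
# closes </i> without having opened it:
#
#     >>> split_unbalanced(['<b>', 'Hi ', '</i>'])
#     (['<b>'], ['Hi '], ['</i>'])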

def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
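
# Example (illustrative):
#
#     >>> split_delete(['one ', DEL_START, 'two ', DEL_END, 'three'])
#     (['one '], ['two '], ['three'])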

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can't move the point further
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)
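
# Example (illustrative): a token is a string subclass that carries its
# surrounding markup and trailing whitespace along with it:
#
#     >>> t = token('Hello', pre_tags=['<b>'], trailing_whitespace=' ')
#     >>> t
#     token('Hello', ['<b>'], [], ' ')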

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):
    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Then re-join the chunks into token objects:
    return fixup_chunks(chunks)
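
# Example (illustrative): tags attach to the word they precede or follow:
#
#     >>> tokenize('<p>Hi there</p>')
#     [token('Hi', ['<p>'], [], ' '), token('there', [], ['</p>'], '')]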

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html
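
# Example (illustrative): page structure and pre-existing <ins>/<del>
# markup are stripped, leaving only the body content:
#
#     >>> cleanup_html('<html><body><p>Hi <ins>there</ins></p></body></html>')
#     '<p>Hi there</p>'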


end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words
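
# Example (illustrative): each word keeps the whitespace that follows it:
#
#     >>> split_words('Hello  cruel world\n')
#     ['Hello  ', 'cruel ', 'world\n']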

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually moves the <ins>/<del>
    markup inside any block-level tags contained in el.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level children: wrap the whole content in a single
        # <ins>/<del> element.
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            # Recurse into block-level children.
            _move_el_inside_block(child, tag)
            if child.tail:
                # Any tail text gets its own wrapper element.
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            # Inline children are wrapped individually.
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
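
# Example (illustrative): a short shared run ('of', 'the') inside otherwise
# different eight-token sequences falls at the threshold (min(2, 8 / 4) == 2,
# and 2 is not > 2), so it is dropped from the matching blocks:
#
#     a = 'one two three four of the five six'.split()
#     b = 'alpha beta gamma delta of the epsilon zeta'.split()
#     InsensitiveSequenceMatcher(a=a, b=b).get_matching_blocks()
#     # -> only the zero-length terminator block remains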

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()