from __future__ import absolute_import

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy is to split the documents up into logical
    # tokens (words with attached markup).  We then diff each pair of
    # adjacent versions to track when a token first appeared in the
    # document; the annotation attached to the token is the version
    # where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()
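
# A minimal usage sketch for html_annotate (the fragments, version labels and
# the markup_as_class helper below are invented for illustration):
#
#     def markup_as_class(text, version):
#         # Hypothetical alternative to default_markup that emits a class
#         # attribute instead of a title attribute.
#         return '<span class="%s">%s</span>' % (html_escape(_unicode(version), 1), text)
#
#     html_annotate([('Hello World', 'v1'), ('Goodbye World', 'v2')],
#                   markup=markup_as_class)
#     # -> '<span class="v2">Goodbye</span> <span class="v1">World</span>'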

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
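
# Illustrative sketch of what compress_tokens does (tokens invented): two
# adjacent tokens with no tags between them and the same annotation are
# merged into one, keeping the intervening whitespace:
#
#     t1 = token('Hello', trailing_whitespace=' '); t1.annotation = 'v1'
#     t2 = token('World');                          t2.annotation = 'v1'
#     compress_tokens([t1, t2])
#     # -> [token('Hello World')], still annotated 'v1'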

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
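
# A minimal usage sketch for htmldiff (fragments invented; the exact
# whitespace in the output may differ slightly):
#
#     htmldiff('<p>Coffee is hot</p>', '<p>Coffee is cold</p>')
#     # -> '<p>Coffee is <ins>cold</ins> <del>hot</del> </p>'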

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- anywhere
    # in the order of the appearance of the old document.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If the deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result
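
# Illustrative sketch of the intermediate chunk list built above for the
# htmldiff example shown earlier (values invented; DEL_START/DEL_END are the
# sentinel classes defined below):
#
#     ['<p>', 'Coffee ', 'is ', '<ins>', 'cold', '</ins> ', '</p>',
#      DEL_START, 'hot', '</p>', DEL_END]
#
# cleanup_delete then resolves the sentinels into a real <del>...</del> span,
# dropping or relocating tags as needed to keep the markup balanced.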

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end
        # with a space:
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move the space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with markers to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location to where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))"""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up the case where the word before us didn't have a
            # trailing space:
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move the space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks
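
# Illustrative sketch of cleanup_delete on an already balanced delete
# (chunks invented):
#
#     cleanup_delete(['<p>', 'Hello ', DEL_START, 'world ', DEL_END, '</p>'])
#     # -> ['<p>', 'Hello ', '<del>', 'world', '</del> ', '</p>']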

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end
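
# Illustrative sketch of split_unbalanced (chunks invented):
#
#     split_unbalanced(['<b>', 'Hello ', '</b>', '</div>', '<p>', 'World '])
#     # -> (['<p>'],                                # opened, never closed here
#     #     ['<b>', 'Hello ', '</b>', 'World '],    # balanced portion
#     #     ['</div>'])                             # closed, never opened here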

def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (
            _unicode.__repr__(self), self.pre_tags,
            self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)


class tag_token(token):
    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr


class href_token(token):
    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self


def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)
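
# Illustrative sketch of the token stream tokenize produces (fragment invented):
#
#     tokenize('<p>Hello <b>World</b></p>')
#     # -> [token('Hello', pre_tags=['<p>'], trailing_whitespace=' '),
#     #     token('World', pre_tags=['<b>'], post_tags=['</b>', '</p>'])]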

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html


end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False, "unexpected chunk: %r" % (chunk,)

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)
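
# Illustrative sketch of the chunks flatten_el generates for a parsed
# fragment (input invented; skip_tag=True skips the wrapping <div> that
# parse_html adds):
#
#     el = parse_html('<p>Hello <b>World</b></p>', cleanup=True)
#     list(flatten_el(el, include_hrefs=True, skip_tag=True))
#     # -> ['<p>', 'Hello ', '<b>', 'World', '</b>', '</p>']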

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words. Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words
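
# Illustrative sketch of split_words (input invented); each word keeps the
# whitespace that follows it:
#
#     split_words('Hello   cruel world ')
#     # -> ['Hello   ', 'cruel ', 'world ']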

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Remove the extra starting tag:
        html = html[html.find('>')+1:]
        # Remove the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level children: wrap all of el's text and children in
        # a single new <ins>/<del> element.
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            # Recurse into the block-level child, and wrap any tail text
            # that follows it:
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                child.addnext(tail_tag)
        else:
            # Wrap the non-block child itself:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()
863
865 """
866 Acts like SequenceMatcher, but tries not to find very small equal
867 blocks amidst large spans of changes
868 """
869
870 threshold = 2
871
873 size = min(len(self.b), len(self.b))
874 threshold = min(self.threshold, size / 4)
875 actual = difflib.SequenceMatcher.get_matching_blocks(self)
876 return [item for item in actual
877 if item[2] > threshold
878 or not item[2]]
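
# Rough intuition (not an exact transcript of any run): with threshold = 2,
# equal blocks of two words or fewer that sit inside otherwise changed text
# are dropped from get_matching_blocks(), so short common words like 'the'
# or 'a' do not anchor tiny spurious "equal" regions in a heavily edited
# paragraph.  The zero-length terminating block is always kept, as difflib
# requires.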


if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()