import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy is to split each document into logical tokens
    # (words with any attached markup), then diff each version against
    # the previous one so that every token carries the annotation of
    # the version in which it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After the annotations have been tracked, adjacent tokens that
    # share an annotation are combined into single spans
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
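
# Illustrative sketch (not part of the original module): two adjacent words
# with the same annotation and no intervening tags collapse into one token.
#
#     toks = tokenize_annotated('Hello World', 'v1')
#     merged = compress_tokens(toks)
#     # len(merged) == 1 and merged[0] compares equal to 'Hello World'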

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += ' '
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += ' '
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
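
# Illustrative usage (not part of the original module):
#
#     htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>')
#
# returns roughly '<p> <ins>Goodbye</ins> <del>Hello</del> World</p>';
# the exact whitespace around the <ins>/<del> spans can vary.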

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only make a best-effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  We also try to put the deletes as close
    # as possible to the location where we think they would have been,
    # preferring to put them in the location of the change.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> we could have an
    # invalid document at this point.  Instead we insert special
    # markers, and once the complete diffed document has been created
    # we move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + ' '
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Unbalanced tags are not wrapped in <ins> (they belong to markup
    # that continues before or after the insert); only the balanced
    # portion goes inside the <ins> element.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end
        # with a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # Move the trailing space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)
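
# Illustrative sketch (not part of the original module): wrapping an inserted
# word; the trailing space moves outside the closing tag.
#
#     doc = ['Hello ']
#     merge_insert(['Goodbye '], doc)
#     # doc == ['Hello ', '<ins>', 'Goodbye', '</ins> ']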

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location to where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))."""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff inside DEL_START/DEL_END may not be well-balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up the case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # Move the trailing space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks
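
# Illustrative sketch (not part of the original module): a delete marked with
# the sentinels is rewritten into a <del> span, with the trailing space moved
# outside the closing tag.
#
#     cleanup_delete([DEL_START, 'Hello ', DEL_END])
#     # -> ['<del>', 'Hello', '</del> ']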

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end
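
# Illustrative sketch (not part of the original module): a span that opens a
# <div> it never closes and closes a <b> it never opened.
#
#     split_unbalanced(['<div>', 'text', '</b>'])
#     # -> (['<div>'], ['text'], ['</b>'])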

def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, we can't go further
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, self.post_tags)

    def html(self):
        return _unicode(self)

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=False):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Rejoin the chunks into token objects:
    return fixup_chunks(chunks)

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html
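
# Illustrative sketch (not part of the original module): page structure and
# any existing <ins>/<del> markup are stripped before diffing.
#
#     cleanup_html('<html><body><p>Hi <ins>new</ins></p></body></html>')
#     # -> '<p>Hi new</p>'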

end_whitespace_re = re.compile(r'[ \t\n\r]$')

def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag = chunk[2]
                if tag.endswith(' '):
                    tag = tag[:-1]
                    trailing_whitespace = True
                else:
                    trailing_whitespace = False
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)
            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
                tag_accum = []
                result.append(cur_word)
            continue
        if is_word(chunk):
            if chunk.endswith(' '):
                chunk = chunk[:-1]
                trailing_whitespace = True
            else:
                trailing_whitespace = False
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)
        elif is_start_tag(chunk):
            tag_accum.append(chunk)
        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert(0)

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result
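
# Illustrative sketch (not part of the original module): chunks produced by
# flatten_el() become tokens, with the surrounding tags attached to the words.
#
#     fixup_chunks(['<p>', 'Hello ', 'World', '</p>'])
#     # -> [token('Hello', pre_tags=['<p>']), token('World', post_tags=['</p>'])]
#     # ('Hello' additionally records its trailing whitespace)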


# HTML tags that do not require an end tag:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )

def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace (one
    space) on each word when appropriate.  """
    if not text or not text.strip():
        return []
    words = [w + ' ' for w in text.strip().split()]
    if not end_whitespace_re.search(text):
        words[-1] = words[-1][:-1]
    return words
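
# Illustrative sketch (not part of the original module): each word keeps one
# trailing space, except the last word when the text itself did not end in
# whitespace.
#
#     split_words('Hello  World')    # -> ['Hello ', 'World']
#     split_words('Hello World ')    # -> ['Hello ', 'World ']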

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)
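
# Illustrative sketch (not part of the original module): serializing the tags
# of a parsed anchor element.
#
#     el = fragment_fromstring('<a href="/x">link</a>')
#     start_tag(el)    # -> '<a href="/x">'
#     end_tag(el)      # -> '</a>' (no trailing space, since el has no tail)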

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in list(doc.iter(tag)):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>,
    etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; takes the <ins>/<del> element el and
    moves its markup inside any block-level tags it contains.  (Minimal
    sketch; the upstream implementation may differ in detail.) """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level children: wrap all of el's content in one new tag
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            # Recurse into block-level children, wrapping any tail text
            # in its own tag so it is not lost:
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                child.addnext(tail_tag)
        else:
            # Wrap inline children in the tag:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
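
# Illustrative sketch (not part of the original module): for longer sequences
# the threshold is 2, so an isolated 1-token match between two otherwise
# different spans is dropped and the whole span is reported as a replacement.
#
#     a = 'one two three four five six seven eight'.split()
#     b = 'uno dos three quatro cinco seis siete ocho'.split()
#     m = InsensitiveSequenceMatcher(a=a, b=b)
#     # m.get_matching_blocks() keeps only the zero-length terminator block;
#     # the lone 'three' match is below the threshold.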

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()