from __future__ import absolute_import

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str


def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy is to split the documents up into document
    # tokens (which are words with attached markup) and then join
    # them back together into the new document.  Words are assigned
    # annotations by the sequence matching and copying routines.

    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After all the annotations have been tracked, spans of text that
    # are adjacent and share an annotation are combined...
    cur_tokens = compress_tokens(cur_tokens)
    # ... and finally the markup function is applied.
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
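
# Illustrative sketch (comment only, not executed): if two adjacent tokens
# carry the same annotation and no markup sits between them, they are folded
# into one.  For example, the two tokens produced by
#
#     tokenize_annotated('Hello World', 'v1')
#
# ('Hello' and 'World', both annotated 'v1') come back from compress_tokens
# as a single 'Hello World' token annotated 'v1'.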

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


def htmldiff(old_html, new_html):
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
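
# Illustrative usage (comment only, not executed as part of the module):
#
#     from lxml.html.diff import htmldiff
#     print(htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>'))
#
# prints markup along the lines of
# '<p><ins>Goodbye</ins> <del>Hello</del> World</p>'; the exact whitespace
# and tag placement can vary.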

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt to
    # keep markup from the old document; anything that we can't resolve
    # we throw away.  Also we try to put the deletes as close to the
    # location where we think they would have been -- anything out of
    # place will be moved to the end of the deleted portion.

    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)

    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead, special DEL_START/DEL_END
    # markers were put in; once the whole document has been assembled,
    # cleanup_delete moves them around and resolves them into real tags.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced start/end tags (we assume
    # there is accompanying markup earlier or later in the document),
    # we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't
        # end with a space.
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move the trailing space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until the cleanup phase turns them into proper markup:
class DEL_START:
    pass
class DEL_END:
    pass


class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))"""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up the case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move the trailing space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

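# Illustrative sketch (comment only): merge_delete marks a deletion as, e.g.,
#
#     ['Hello ', DEL_START, 'cruel ', DEL_END, 'world']
#
# and cleanup_delete resolves the markers into real tags, giving
#
#     ['Hello ', '<del>', 'cruel', '</del> ', 'world']
#
# (the trailing space is moved outside the closing </del>).
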
def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end

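# Illustrative sketch (comment only) of how split_unbalanced classifies chunks:
#
#     split_unbalanced(['<p>', 'Hi', '</p>', '</div>'])
#     -> ([], ['<p>', 'Hi', '</p>'], ['</div>'])
#
#     split_unbalanced(['<div>', 'More'])
#     -> (['<div>'], ['More'], [])
#
# The lone </div> and <div> are reported as unbalanced so the caller can
# decide where (or whether) to re-attach them.
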
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # A different tag; stop moving the point
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # A different tag; stop moving the point
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)

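# Illustrative sketch (comment only): a token compares like its plain text
# while carrying its markup along.  For example
#
#     tok = token('Hello', pre_tags=['<b>'], post_tags=['</b>'],
#                 trailing_whitespace=' ')
#
# behaves like the string 'Hello' in the sequence matcher, but remembers the
# surrounding <b>...</b> and the trailing space for serialization.
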
class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-join them into token objects:
    return fixup_chunks(chunks)

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html


end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result
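
# Illustrative sketch (comment only): given the flattened chunks of
# '<p>Hello world</p>', i.e. ['<p>', 'Hello ', 'world', '</p>'], fixup_chunks
# returns two tokens: token('Hello', pre_tags=['<p>'], trailing_whitespace=' ')
# and token('world', post_tags=['</p>']).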


empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words

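# Illustrative sketch (comment only):
#
#     split_words('Hello  cruel world\n')
#     -> ['Hello  ', 'cruel ', 'world\n']
#
# Each word keeps whatever whitespace follows it.
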
start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')


def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Remove the extra starting tag:
        html = html[html.find('>')+1:]
        # Remove the extra ending tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()

def _contains_block_level_tag(el):
    """True if the element contains any block-level tags, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child; wrap all the content in one tag
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                # Move the tail into its own wrapper tag after the child:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            # Wrap the non-block child in the tag:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]

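# Illustrative note: with the default threshold of 2, an "equal" run of only
# one or two tokens inside a large changed region is discarded, so the diff
# reports it as changed rather than producing noisy single-word matches.
# Zero-length blocks are kept because difflib ends its block list with a
# (len(a), len(b), 0) sentinel.
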
if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()