Package lxml :: Package html :: Module diff
Source Code for Module lxml.html.diff

  1  from __future__ import absolute_import 
  2   
  3  import difflib 
  4  from lxml import etree 
  5  from lxml.html import fragment_fromstring 
  6  import re 
  7   
  8  __all__ = ['html_annotate', 'htmldiff'] 
  9   
 10  try: 
 11      from html import escape as html_escape 
 12  except ImportError: 
 13      from cgi import escape as html_escape 
 14  try: 
 15      _unicode = unicode 
 16  except NameError: 
 17      # Python 3 
 18      _unicode = str 
 19  try: 
 20      basestring 
 21  except NameError: 
 22      # Python 3 
 23      basestring = str 
 24   
 25  ############################################################ 
 26  ## Annotation 
 27  ############################################################ 
 28   
 29 -def default_markup(text, version): 
 30      return '<span title="%s">%s</span>' % ( 
 31          html_escape(_unicode(version), 1), text) 
 32   
 33 -def html_annotate(doclist, markup=default_markup): 
 34      """ 
 35      doclist should be ordered from oldest to newest, like:: 
 36   
 37          >>> version1 = 'Hello World' 
 38          >>> version2 = 'Goodbye World' 
 39          >>> print(html_annotate([(version1, 'version 1'), 
 40          ...                      (version2, 'version 2')])) 
 41          <span title="version 2">Goodbye</span> <span title="version 1">World</span> 
 42   
 43      The documents must be *fragments* (str/UTF8 or unicode), not 
 44      complete documents 
 45   
 46      The markup argument is a function to markup the spans of words. 
 47      This function is called like markup('Hello', 'version 2'), and 
 48      returns HTML.  The first argument is text and never includes any 
 49      markup.  The default uses a span with a title: 
 50   
 51          >>> print(default_markup('Some Text', 'by Joe')) 
 52          <span title="by Joe">Some Text</span> 
 53      """ 
 54      # The basic strategy we have is to split the documents up into 
 55      # logical tokens (which are words with attached markup).  We then 
 56      # do diffs of each of the versions to track when a token first 
 57      # appeared in the document; the annotation attached to the token 
 58      # is the version where it first appeared. 
 59      tokenlist = [tokenize_annotated(doc, version) 
 60                   for doc, version in doclist] 
 61      cur_tokens = tokenlist[0] 
 62      for tokens in tokenlist[1:]: 
 63          html_annotate_merge_annotations(cur_tokens, tokens) 
 64          cur_tokens = tokens 
 65   
 66      # After we've tracked all the tokens, we can combine spans of text 
 67      # that are adjacent and have the same annotation 
 68      cur_tokens = compress_tokens(cur_tokens) 
 69      # And finally add markup 
 70      result = markup_serialize_tokens(cur_tokens, markup) 
 71      return ''.join(result).strip() 
 72   
 73 -def tokenize_annotated(doc, annotation):  
 74      """Tokenize a document and add an annotation attribute to each token 
 75      """ 
 76      tokens = tokenize(doc, include_hrefs=False) 
 77      for tok in tokens:  
 78          tok.annotation = annotation 
 79      return tokens 
 80   
 81 -def html_annotate_merge_annotations(tokens_old, tokens_new):  
 82      """Merge the annotations from tokens_old into tokens_new, when the 
 83      tokens in the new document already existed in the old document. 
 84      """ 
 85      s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new) 
 86      commands = s.get_opcodes() 
 87   
 88      for command, i1, i2, j1, j2 in commands: 
 89          if command == 'equal':  
 90              eq_old = tokens_old[i1:i2] 
 91              eq_new = tokens_new[j1:j2] 
 92              copy_annotations(eq_old, eq_new) 
 93   
 94 -def copy_annotations(src, dest):  
 95      """ 
 96      Copy annotations from the tokens listed in src to the tokens in dest 
 97      """ 
 98      assert len(src) == len(dest) 
 99      for src_tok, dest_tok in zip(src, dest):  
100          dest_tok.annotation = src_tok.annotation 
101   
102 -def compress_tokens(tokens): 
103      """ 
104      Combine adjacent tokens when there is no HTML between the tokens,  
105      and they share an annotation 
106      """ 
107      result = [tokens[0]]  
108      for tok in tokens[1:]:  
109          if (not result[-1].post_tags and  
110              not tok.pre_tags and  
111              result[-1].annotation == tok.annotation):  
112              compress_merge_back(result, tok) 
113          else:  
114              result.append(tok) 
115      return result 
116   
117 -def compress_merge_back(tokens, tok):  
118      """ Merge tok into the last element of tokens (modifying the list of 
119      tokens in-place).  """ 
120      last = tokens[-1] 
121      if type(last) is not token or type(tok) is not token:  
122          tokens.append(tok) 
123      else: 
124          text = _unicode(last) 
125          if last.trailing_whitespace: 
126              text += last.trailing_whitespace 
127          text += tok 
128          merged = token(text, 
129                         pre_tags=last.pre_tags, 
130                         post_tags=tok.post_tags, 
131                         trailing_whitespace=tok.trailing_whitespace) 
132          merged.annotation = last.annotation 
133          tokens[-1] = merged 
134       
135 -def markup_serialize_tokens(tokens, markup_func): 
136      """ 
137      Serialize the list of tokens into a list of text chunks, calling 
138      markup_func around text to add annotations. 
139      """ 
140      for token in tokens: 
141          for pre in token.pre_tags: 
142              yield pre 
143          html = token.html() 
144          html = markup_func(html, token.annotation) 
145          if token.trailing_whitespace: 
146              html += token.trailing_whitespace 
147          yield html 
148          for post in token.post_tags: 
149              yield post 
150   
151   
152  ############################################################ 
153  ## HTML Diffs 
154  ############################################################ 
155   
156 -def htmldiff(old_html, new_html): 
157      ## FIXME: this should take parsed documents too, and use their body 
158      ## or other content. 
159      """ Do a diff of the old and new document.  The documents are HTML 
160      *fragments* (str/UTF8 or unicode), they are not complete documents 
161      (i.e., no <html> tag). 
162   
163      Returns HTML with <ins> and <del> tags added around the 
164      appropriate text.   
165   
166      Markup is generally ignored, with the markup from new_html 
167      preserved, and possibly some markup from old_html (though it is 
168      considered acceptable to lose some of the old markup).  Only the 
169      words in the HTML are diffed.  The exception is <img> tags, which 
170      are treated like words, and the href attribute of <a> tags, which 
171      are noted inside the tag itself when there are changes. 
172      """  
173      old_html_tokens = tokenize(old_html) 
174      new_html_tokens = tokenize(new_html) 
175      result = htmldiff_tokens(old_html_tokens, new_html_tokens) 
176      result = ''.join(result).strip() 
177      return fixup_ins_del_tags(result) 
178   
179 -def htmldiff_tokens(html1_tokens, html2_tokens): 
180      """ Does a diff on the tokens themselves, returning a list of text 
181      chunks (not tokens). 
182      """ 
183      # There are several passes as we do the differences.  The tokens 
184      # isolate the portion of the content we care to diff; difflib does 
185      # all the actual hard work at that point.   
186      # 
187      # Then we must create a valid document from pieces of both the old 
188      # document and the new document.  We generally prefer to take 
189      # markup from the new document, and only do a best effort attempt 
190      # to keep markup from the old document; anything that we can't 
191      # resolve we throw away.  Also we try to put the deletes as close 
192      # to the location where we think they would have been -- because 
193      # we are only keeping the markup from the new document, it can be 
194      # fuzzy where in the new document the old text would have gone. 
195      # Again we just do a best effort attempt. 
196      s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens) 
197      commands = s.get_opcodes() 
198      result = [] 
199      for command, i1, i2, j1, j2 in commands: 
200          if command == 'equal': 
201              result.extend(expand_tokens(html2_tokens[j1:j2], equal=True)) 
202              continue 
203          if command == 'insert' or command == 'replace': 
204              ins_tokens = expand_tokens(html2_tokens[j1:j2]) 
205              merge_insert(ins_tokens, result) 
206          if command == 'delete' or command == 'replace': 
207              del_tokens = expand_tokens(html1_tokens[i1:i2]) 
208              merge_delete(del_tokens, result) 
209      # If deletes were inserted directly as <del> then we'd have an 
210      # invalid document at this point.  Instead we put in special 
211      # markers, and when the complete diffed document has been created 
212      # we try to move the deletes around and resolve any problems. 
213      result = cleanup_delete(result) 
214   
215      return result 
216   
217 -def expand_tokens(tokens, equal=False): 
218      """Given a list of tokens, return a generator of the chunks of 
219      text for the data in the tokens. 
220      """ 
221      for token in tokens: 
222          for pre in token.pre_tags: 
223              yield pre 
224          if not equal or not token.hide_when_equal: 
225              if token.trailing_whitespace: 
226                  yield token.html() + token.trailing_whitespace 
227              else: 
228                  yield token.html() 
229          for post in token.post_tags: 
230              yield post 
231   
232 -def merge_insert(ins_chunks, doc): 
233      """ doc is the already-handled document (as a list of text chunks); 
234      here we add <ins>ins_chunks</ins> to the end of that.  """ 
235      # Though we don't throw away unbalanced_start or unbalanced_end 
236      # (we assume there is accompanying markup later or earlier in the 
237      # document), we only put <ins> around the balanced portion. 
238      unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks) 
239      doc.extend(unbalanced_start) 
240      if doc and not doc[-1].endswith(' '): 
241          # Fix up the case where the word before the insert didn't end with  
242          # a space 
243          doc[-1] += ' ' 
244      doc.append('<ins>') 
245      if balanced and balanced[-1].endswith(' '): 
246          # We move space outside of </ins> 
247          balanced[-1] = balanced[-1][:-1] 
248      doc.extend(balanced) 
249      doc.append('</ins> ') 
250      doc.extend(unbalanced_end) 
251   
252  # These are sentinals to represent the start and end of a <del> 
253  # segment, until we do the cleanup phase to turn them into proper 
254  # markup: 
255 -class DEL_START: 
256      pass 
257 -class DEL_END: 
258      pass 
259   
260 -class NoDeletes(Exception): 
261      """ Raised when the document no longer contains any pending deletes 
262      (DEL_START/DEL_END) """ 
263   
264 -def merge_delete(del_chunks, doc): 
265      """ Adds the text chunks in del_chunks to the document doc (another 
266      list of text chunks) with marker to show it is a delete. 
267      cleanup_delete later resolves these markers into <del> tags.""" 
268      doc.append(DEL_START) 
269      doc.extend(del_chunks) 
270      doc.append(DEL_END) 
271   
272 -def cleanup_delete(chunks): 
273      """ Cleans up any DEL_START/DEL_END markers in the document, replacing 
274      them with <del></del>.  To do this while keeping the document 
275      valid, it may need to drop some tags (either start or end tags). 
276   
277      It may also move the del into adjacent tags to try to move it to a 
278      similar location where it was originally located (e.g., moving a 
279      delete into preceding <div> tag, if the del looks like (DEL_START, 
280      'Text</div>', DEL_END)""" 
281      while 1: 
282          # Find a pending DEL_START/DEL_END, splitting the document 
283          # into stuff-preceding-DEL_START, stuff-inside, and 
284          # stuff-following-DEL_END 
285          try: 
286              pre_delete, delete, post_delete = split_delete(chunks) 
287          except NoDeletes: 
288              # Nothing found, we've cleaned up the entire doc 
289              break 
290          # The stuff-inside-DEL_START/END may not be well balanced 
291          # markup.  First we figure out what unbalanced portions there are: 
292          unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete) 
293          # Then we move the span forward and/or backward based on these 
294          # unbalanced portions: 
295          locate_unbalanced_start(unbalanced_start, pre_delete, post_delete) 
296          locate_unbalanced_end(unbalanced_end, pre_delete, post_delete) 
297          doc = pre_delete 
298          if doc and not doc[-1].endswith(' '): 
299              # Fix up case where the word before us didn't have a trailing space 
300              doc[-1] += ' ' 
301          doc.append('<del>') 
302          if balanced and balanced[-1].endswith(' '): 
303              # We move space outside of </del> 
304              balanced[-1] = balanced[-1][:-1] 
305          doc.extend(balanced) 
306          doc.append('</del> ') 
307          doc.extend(post_delete) 
308          chunks = doc 
309      return chunks 
310   
311 -def split_unbalanced(chunks): 
312      """Return (unbalanced_start, balanced, unbalanced_end), where each is 
313      a list of text and tag chunks. 
314   
315      unbalanced_start is a list of all the tags that are opened, but 
316      not closed in this span.  Similarly, unbalanced_end is a list of 
317      tags that are closed but were not opened.  Extracting these might 
318      mean some reordering of the chunks.""" 
319      start = [] 
320      end = [] 
321      tag_stack = [] 
322      balanced = [] 
323      for chunk in chunks: 
324          if not chunk.startswith('<'): 
325              balanced.append(chunk) 
326              continue 
327          endtag = chunk[1] == '/' 
328          name = chunk.split()[0].strip('<>/') 
329          if name in empty_tags: 
330              balanced.append(chunk) 
331              continue 
332          if endtag: 
333              if tag_stack and tag_stack[-1][0] == name: 
334                  balanced.append(chunk) 
335                  name, pos, tag = tag_stack.pop() 
336                  balanced[pos] = tag 
337              elif tag_stack: 
338                  start.extend([tag for name, pos, tag in tag_stack]) 
339                  tag_stack = [] 
340                  end.append(chunk) 
341              else: 
342                  end.append(chunk) 
343          else: 
344              tag_stack.append((name, len(balanced), chunk)) 
345              balanced.append(None) 
346      start.extend( 
347          [chunk for name, pos, chunk in tag_stack]) 
348      balanced = [chunk for chunk in balanced if chunk is not None] 
349      return start, balanced, end 
350   
351 -def split_delete(chunks): 
352      """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END, 
353      stuff_after_DEL_END).  Returns the first case found (there may be 
354      more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if 
355      there's no DEL_START found. """ 
356      try: 
357          pos = chunks.index(DEL_START) 
358      except ValueError: 
359          raise NoDeletes 
360      pos2 = chunks.index(DEL_END) 
361      return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:] 
362   
363 -def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete): 
364      """ pre_delete and post_delete implicitly point to a place in the 
365      document (where the two were split).  This moves that point (by 
366      popping items from one and pushing them onto the other).  It moves 
367      the point to try to find a place where unbalanced_start applies. 
368   
369      As an example:: 
370   
371          >>> unbalanced_start = ['<div>'] 
372          >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>'] 
373          >>> pre, post = doc[:3], doc[3:] 
374          >>> pre, post 
375          (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>']) 
376          >>> locate_unbalanced_start(unbalanced_start, pre, post) 
377          >>> pre, post 
378          (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>']) 
379   
380      As you can see, we moved the point so that the dangling <div> that 
381      we found will be effectively replaced by the div in the original 
382      document.  If this doesn't work out, we just throw away 
383      unbalanced_start without doing anything. 
384      """ 
385      while 1: 
386          if not unbalanced_start: 
387              # We have totally succeeded in finding the position 
388              break 
389          finding = unbalanced_start[0] 
390          finding_name = finding.split()[0].strip('<>') 
391          if not post_delete: 
392              break 
393          next = post_delete[0] 
394          if next is DEL_START or not next.startswith('<'): 
395              # Reached a word, we can't move the delete text forward 
396              break 
397          if next[1] == '/': 
398              # Reached a closing tag, can we go further?  Maybe not... 
399              break 
400          name = next.split()[0].strip('<>') 
401          if name == 'ins': 
402              # Can't move into an insert 
403              break 
404          assert name != 'del', ( 
405              "Unexpected delete tag: %r" % next) 
406          if name == finding_name: 
407              unbalanced_start.pop(0) 
408              pre_delete.append(post_delete.pop(0)) 
409          else: 
410              # Found a tag that doesn't match 
411              break 
412   
413 -def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete): 
414      """ like locate_unbalanced_start, except handling end tags and 
415      possibly moving the point earlier in the document.  """ 
416      while 1: 
417          if not unbalanced_end: 
418              # Success 
419              break 
420          finding = unbalanced_end[-1] 
421          finding_name = finding.split()[0].strip('<>/') 
422          if not pre_delete: 
423              break 
424          next = pre_delete[-1] 
425          if next is DEL_END or not next.startswith('</'): 
426              # A word or a start tag 
427              break 
428          name = next.split()[0].strip('<>/') 
429          if name == 'ins' or name == 'del': 
430              # Can't move into an insert or delete 
431              break 
432          if name == finding_name: 
433              unbalanced_end.pop() 
434              post_delete.insert(0, pre_delete.pop()) 
435          else: 
436              # Found a tag that doesn't match 
437              break 
438   
439 -class token(_unicode): 
440      """ Represents a diffable token, generally a word that is displayed to 
441      the user.  Opening tags are attached to this token when they are 
442      adjacent (pre_tags) and closing tags that follow the word 
443      (post_tags).  Some exceptions occur when there are empty tags 
444      adjacent to a word, so there may be close tags in pre_tags, or 
445      open tags in post_tags. 
446   
447      We also keep track of whether the word was originally followed by 
448      whitespace, even though we do not want to treat the word as 
449      equivalent to a similar word that does not have a trailing 
450      space.""" 
451   
452      # When this is true, the token will be eliminated from the 
453      # displayed diff if no change has occurred: 
454      hide_when_equal = False 
455   
456 -    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""): 
457          obj = _unicode.__new__(cls, text) 
458   
459          if pre_tags is not None: 
460              obj.pre_tags = pre_tags 
461          else: 
462              obj.pre_tags = [] 
463   
464          if post_tags is not None: 
465              obj.post_tags = post_tags 
466          else: 
467              obj.post_tags = [] 
468   
469          obj.trailing_whitespace = trailing_whitespace 
470   
471          return obj 
472   
473 -    def __repr__(self): 
474          return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, 
475                                            self.post_tags, self.trailing_whitespace) 
476   
477 -    def html(self): 
478          return _unicode(self) 
479   
480 -class tag_token(token): 
481   
482      """ Represents a token that is actually a tag.  Currently this is just 
483      the <img> tag, which takes up visible space just like a word but 
484      is only represented in a document by a tag.  """ 
485   
486 -    def __new__(cls, tag, data, html_repr, pre_tags=None,  
487                  post_tags=None, trailing_whitespace=""): 
488          obj = token.__new__(cls, "%s: %s" % (type, data),  
489                              pre_tags=pre_tags,  
490                              post_tags=post_tags,  
491                              trailing_whitespace=trailing_whitespace) 
492          obj.tag = tag 
493          obj.data = data 
494          obj.html_repr = html_repr 
495          return obj 
496   
497 -    def __repr__(self): 
498          return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % ( 
499              self.tag,  
500              self.data,  
501              self.html_repr,  
502              self.pre_tags,  
503              self.post_tags,  
504              self.trailing_whitespace) 
505 -    def html(self): 
506          return self.html_repr 
507   
508 -class href_token(token): 
509   
510      """ Represents the href in an anchor tag.  Unlike other words, we only 
511      show the href when it changes.  """ 
512   
513      hide_when_equal = True 
514   
515 -    def html(self): 
516          return ' Link: %s' % self 
517   
518 -def tokenize(html, include_hrefs=True): 
519      """ 
520      Parse the given HTML and returns token objects (words with attached tags). 
521   
522      This parses only the content of a page; anything in the head is 
523      ignored, and the <head> and <body> elements are themselves 
524      optional.  The content is then parsed by lxml, which ensures the 
525      validity of the resulting parsed document (though lxml may make 
526      incorrect guesses when the markup is particular bad). 
527   
528      <ins> and <del> tags are also eliminated from the document, as 
529      that gets confusing. 
530   
531      If include_hrefs is true, then the href attribute of <a> tags is 
532      included as a special kind of diffable token.""" 
533      if etree.iselement(html): 
534          body_el = html 
535      else: 
536          body_el = parse_html(html, cleanup=True) 
537      # Then we split the document into text chunks for each tag, word, and end tag: 
538      chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs) 
539      # Finally re-joining them into token objects: 
540      return fixup_chunks(chunks) 
541   
542 -def parse_html(html, cleanup=True): 
543      """ 
544      Parses an HTML fragment, returning an lxml element.  Note that the HTML will be 
545      wrapped in a <div> tag that was not in the original document. 
546   
547      If cleanup is true, make sure there's no <head> or <body>, and get 
548      rid of any <ins> and <del> tags. 
549      """ 
550      if cleanup: 
551          # This removes any extra markup or structure like <head>: 
552          html = cleanup_html(html) 
553      return fragment_fromstring(html, create_parent=True) 
554   
555  _body_re = re.compile(r'<body.*?>', re.I|re.S) 
556  _end_body_re = re.compile(r'</body.*?>', re.I|re.S) 
557  _ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S) 
558   
559 -def cleanup_html(html): 
560      """ This 'cleans' the HTML, meaning that any page structure is removed 
561      (only the contents of <body> are used, if there is any <body). 
562      Also <ins> and <del> tags are removed.  """ 
563      match = _body_re.search(html) 
564      if match: 
565          html = html[match.end():] 
566      match = _end_body_re.search(html) 
567      if match: 
568          html = html[:match.start()] 
569      html = _ins_del_re.sub('', html) 
570      return html 
571       
572   
573  end_whitespace_re = re.compile(r'[ \t\n\r]$') 
574   
575 -def split_trailing_whitespace(word): 
576      """ 
577      This function takes a word, such as 'test\n\n' and returns ('test','\n\n') 
578      """ 
579      stripped_length = len(word.rstrip()) 
580      return word[0:stripped_length], word[stripped_length:] 
581   
582   
583 -def fixup_chunks(chunks): 
584      """ 
585      This function takes a list of chunks and produces a list of tokens. 
586      """ 
587      tag_accum = [] 
588      cur_word = None 
589      result = [] 
590      for chunk in chunks: 
591          if isinstance(chunk, tuple): 
592              if chunk[0] == 'img': 
593                  src = chunk[1] 
594                  tag, trailing_whitespace = split_trailing_whitespace(chunk[2]) 
595                  cur_word = tag_token('img', src, html_repr=tag, 
596                                       pre_tags=tag_accum, 
597                                       trailing_whitespace=trailing_whitespace) 
598                  tag_accum = [] 
599                  result.append(cur_word) 
600   
601              elif chunk[0] == 'href': 
602                  href = chunk[1] 
603                  cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ") 
604                  tag_accum = [] 
605                  result.append(cur_word) 
606              continue 
607   
608          if is_word(chunk): 
609              chunk, trailing_whitespace = split_trailing_whitespace(chunk) 
610              cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace) 
611              tag_accum = [] 
612              result.append(cur_word) 
613   
614          elif is_start_tag(chunk): 
615              tag_accum.append(chunk) 
616   
617          elif is_end_tag(chunk): 
618              if tag_accum: 
619                  tag_accum.append(chunk) 
620              else: 
621                  assert cur_word, ( 
622                      "Weird state, cur_word=%r, result=%r, chunks=%r of %r" 
623                      % (cur_word, result, chunk, chunks)) 
624                  cur_word.post_tags.append(chunk) 
625          else: 
626              assert(0) 
627   
628      if not result: 
629          return [token('', pre_tags=tag_accum)] 
630      else: 
631          result[-1].post_tags.extend(tag_accum) 
632   
633      return result 
634   
635   
636  # All the tags in HTML that don't require end tags: 
637  empty_tags = ( 
638      'param', 'img', 'area', 'br', 'basefont', 'input', 
639      'base', 'meta', 'link', 'col') 
640   
641  block_level_tags = ( 
642      'address', 
643      'blockquote', 
644      'center', 
645      'dir', 
646      'div', 
647      'dl', 
648      'fieldset', 
649      'form', 
650      'h1', 
651      'h2', 
652      'h3', 
653      'h4', 
654      'h5', 
655      'h6', 
656      'hr', 
657      'isindex', 
658      'menu', 
659      'noframes', 
660      'noscript', 
661      'ol', 
662      'p', 
663      'pre', 
664      'table', 
665      'ul', 
666      ) 
667   
668  block_level_container_tags = ( 
669      'dd', 
670      'dt', 
671      'frameset', 
672      'li', 
673      'tbody', 
674      'td', 
675      'tfoot', 
676      'th', 
677      'thead', 
678      'tr', 
679      ) 
680   
681   
682 -def flatten_el(el, include_hrefs, skip_tag=False): 
683      """ Takes an lxml element el, and generates all the text chunks for 
684      that tag.  Each start tag is a chunk, each word is a chunk, and each 
685      end tag is a chunk. 
686   
687      If skip_tag is true, then the outermost container tag is 
688      not returned (just its contents).""" 
689      if not skip_tag: 
690          if el.tag == 'img': 
691              yield ('img', el.get('src'), start_tag(el)) 
692          else: 
693              yield start_tag(el) 
694      if el.tag in empty_tags and not el.text and not len(el) and not el.tail: 
695          return 
696      start_words = split_words(el.text) 
697      for word in start_words: 
698          yield html_escape(word) 
699      for child in el: 
700          for item in flatten_el(child, include_hrefs=include_hrefs): 
701              yield item 
702      if el.tag == 'a' and el.get('href') and include_hrefs: 
703          yield ('href', el.get('href')) 
704      if not skip_tag: 
705          yield end_tag(el) 
706          end_words = split_words(el.tail) 
707          for word in end_words: 
708              yield html_escape(word) 
709   
710  split_words_re = re.compile(r'\S+(?:\s+|$)', re.U) 
711   
712 -def split_words(text): 
713      """ Splits some text into words. Includes trailing whitespace 
714      on each word when appropriate.  """ 
715      if not text or not text.strip(): 
716          return [] 
717   
718      words = split_words_re.findall(text) 
719      return words 
720   
721  start_whitespace_re = re.compile(r'^[ \t\n\r]') 
722   
723 -def start_tag(el): 
724      """ 
725      The text representation of the start tag for a tag. 
726      """ 
727      return '<%s%s>' % ( 
728          el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True)) 
729                           for name, value in el.attrib.items()])) 
730   
731 -def end_tag(el): 
732      """ The text representation of an end tag for a tag.  Includes 
733      trailing whitespace when appropriate.  """ 
734      if el.tail and start_whitespace_re.search(el.tail): 
735          extra = ' ' 
736      else: 
737          extra = '' 
738      return '</%s>%s' % (el.tag, extra) 
739   
740 -def is_word(tok): 
741      return not tok.startswith('<') 
742   
743 -def is_end_tag(tok): 
744      return tok.startswith('</') 
745   
746 -def is_start_tag(tok): 
747      return tok.startswith('<') and not tok.startswith('</') 
748   
749 -def fixup_ins_del_tags(html): 
750      """ Given an html string, move any <ins> or <del> tags inside of any 
751      block-level elements, e.g. transform <ins><p>word</p></ins> to 
752      <p><ins>word</ins></p> """ 
753      doc = parse_html(html, cleanup=False) 
754      _fixup_ins_del_tags(doc) 
755      html = serialize_html_fragment(doc, skip_outer=True) 
756      return html 
757   
758 -def serialize_html_fragment(el, skip_outer=False): 
759      """ Serialize a single lxml element as HTML.  The serialized form 
760      includes the elements tail.   
761   
762      If skip_outer is true, then don't serialize the outermost tag 
763      """ 
764      assert not isinstance(el, basestring), ( 
765          "You should pass in an element, not a string like %r" % el) 
766      html = etree.tostring(el, method="html", encoding=_unicode) 
767      if skip_outer: 
768          # Get rid of the extra starting tag: 
769          html = html[html.find('>')+1:] 
770          # Get rid of the extra end tag: 
771          html = html[:html.rfind('<')] 
772          return html.strip() 
773      else: 
774          return html 
775   
776 -def _fixup_ins_del_tags(doc): 
777      """fixup_ins_del_tags that works on an lxml document in-place 
778      """ 
779      for tag in ['ins', 'del']: 
780          for el in doc.xpath('descendant-or-self::%s' % tag): 
781              if not _contains_block_level_tag(el): 
782                  continue 
783              _move_el_inside_block(el, tag=tag) 
784              el.drop_tag() 
785              #_merge_element_contents(el) 
786   
787 -def _contains_block_level_tag(el): 
788      """True if the element contains any block-level elements, like <p>, <td>, etc. 
789      """ 
790      if el.tag in block_level_tags or el.tag in block_level_container_tags: 
791          return True 
792      for child in el: 
793          if _contains_block_level_tag(child): 
794              return True 
795      return False 
796   
797 -def _move_el_inside_block(el, tag): 
798      """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags 
799      and moves them inside any block-level tags.  """ 
800      for child in el: 
801          if _contains_block_level_tag(child): 
802              break 
803      else: 
804          import sys 
805          # No block-level tags in any child 
806          children_tag = etree.Element(tag) 
807          children_tag.text = el.text 
808          el.text = None 
809          children_tag.extend(list(el)) 
810          el[:] = [children_tag] 
811          return 
812      for child in list(el): 
813          if _contains_block_level_tag(child): 
814              _move_el_inside_block(child, tag) 
815              if child.tail: 
816                  tail_tag = etree.Element(tag) 
817                  tail_tag.text = child.tail 
818                  child.tail = None 
819                  el.insert(el.index(child)+1, tail_tag) 
820          else: 
821              child_tag = etree.Element(tag) 
822              el.replace(child, child_tag) 
823              child_tag.append(child) 
824      if el.text: 
825          text_tag = etree.Element(tag) 
826          text_tag.text = el.text 
827          el.text = None 
828          el.insert(0, text_tag) 
829               
830 -def _merge_element_contents(el): 
831      """ 
832      Removes an element, but merges its contents into its place, e.g., 
833      given <p>Hi <i>there!</i></p>, if you remove the <i> element you get 
834      <p>Hi there!</p> 
835      """ 
836      parent = el.getparent() 
837      text = el.text or '' 
838      if el.tail: 
839          if not len(el): 
840              text += el.tail 
841          else: 
842              if el[-1].tail: 
843                  el[-1].tail += el.tail 
844              else: 
845                  el[-1].tail = el.tail 
846      index = parent.index(el) 
847      if text: 
848          if index == 0: 
849              previous = None 
850          else: 
851              previous = parent[index-1] 
852          if previous is None: 
853              if parent.text: 
854                  parent.text += text 
855              else: 
856                  parent.text = text 
857          else: 
858              if previous.tail: 
859                  previous.tail += text 
860              else: 
861                  previous.tail = text 
862      parent[index:index+1] = el.getchildren() 
863   
864 -class InsensitiveSequenceMatcher(difflib.SequenceMatcher): 
865      """ 
866      Acts like SequenceMatcher, but tries not to find very small equal 
867      blocks amidst large spans of changes 
868      """ 
869   
870      threshold = 2 
871       
872 -    def get_matching_blocks(self): 
873          size = min(len(self.b), len(self.b)) 
874          threshold = min(self.threshold, size / 4) 
875          actual = difflib.SequenceMatcher.get_matching_blocks(self) 
876          return [item for item in actual 
877                  if item[2] > threshold 
878                  or not item[2]] 
879   
880  if __name__ == '__main__': 
881      from lxml.html import _diffcommand 
882      _diffcommand.main() 
883