Package lxml :: Package html :: Module diff
Source Code for Module lxml.html.diff

import cgi
import difflib
import re

from lxml import etree
from lxml.html import fragment_fromstring

try:
    # Python 3: cgi.escape() was removed in 3.8; html.escape() replaces it.
    from html import escape as html_escape
except ImportError:
    # Python 2 has no html.escape()
    html_escape = cgi.escape
  6   
  7  __all__ = ['html_annotate', 'htmldiff'] 
  8   
  9  try: 
 10      _unicode = unicode 
 11  except NameError: 
 12      # Python 3 
 13      _unicode = str 
 14  try: 
 15      basestring = __builtins__["basestring"] 
 16  except (KeyError, NameError): 
 17      # Python 3 
 18      basestring = str 
 19   
 20  ############################################################ 
 21  ## Annotation 
 22  ############################################################ 
 23   
 24 -def default_markup(text, version): 
 25      return '<span title="%s">%s</span>' % ( 
 26          cgi.escape(_unicode(version), 1), text) 
 27   
 28 -def html_annotate(doclist, markup=default_markup): 
 29      """ 
 30      doclist should be ordered from oldest to newest, like:: 
 31   
 32          >>> version1 = 'Hello World' 
 33          >>> version2 = 'Goodbye World' 
 34          >>> print(html_annotate([(version1, 'version 1'), 
 35          ...                      (version2, 'version 2')])) 
 36          <span title="version 2">Goodbye</span> <span title="version 1">World</span> 
 37   
 38      The documents must be *fragments* (str/UTF8 or unicode), not 
 39      complete documents 
 40   
 41      The markup argument is a function to markup the spans of words. 
 42      This function is called like markup('Hello', 'version 2'), and 
 43      returns HTML.  The first argument is text and never includes any 
 44      markup.  The default uses a span with a title: 
 45   
 46          >>> print(default_markup('Some Text', 'by Joe')) 
 47          <span title="by Joe">Some Text</span> 
 48      """ 
 49      # The basic strategy we have is to split the documents up into 
 50      # logical tokens (which are words with attached markup).  We then 
 51      # do diffs of each of the versions to track when a token first 
 52      # appeared in the document; the annotation attached to the token 
 53      # is the version where it first appeared. 
 54      tokenlist = [tokenize_annotated(doc, version) 
 55                   for doc, version in doclist] 
 56      cur_tokens = tokenlist[0] 
 57      for tokens in tokenlist[1:]: 
 58          html_annotate_merge_annotations(cur_tokens, tokens) 
 59          cur_tokens = tokens 
 60   
 61      # After we've tracked all the tokens, we can combine spans of text 
 62      # that are adjacent and have the same annotation 
 63      cur_tokens = compress_tokens(cur_tokens) 
 64      # And finally add markup 
 65      result = markup_serialize_tokens(cur_tokens, markup) 
 66      return ''.join(result).strip() 
 67   
 68 -def tokenize_annotated(doc, annotation):  
 69      """Tokenize a document and add an annotation attribute to each token 
 70      """ 
 71      tokens = tokenize(doc, include_hrefs=False) 
 72      for tok in tokens:  
 73          tok.annotation = annotation 
 74      return tokens 
 75   
 76 -def html_annotate_merge_annotations(tokens_old, tokens_new):  
 77      """Merge the annotations from tokens_old into tokens_new, when the 
 78      tokens in the new document already existed in the old document. 
 79      """ 
 80      s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new) 
 81      commands = s.get_opcodes() 
 82   
 83      for command, i1, i2, j1, j2 in commands: 
 84          if command == 'equal':  
 85              eq_old = tokens_old[i1:i2] 
 86              eq_new = tokens_new[j1:j2] 
 87              copy_annotations(eq_old, eq_new) 
 88   
 89 -def copy_annotations(src, dest):  
 90      """ 
 91      Copy annotations from the tokens listed in src to the tokens in dest 
 92      """ 
 93      assert len(src) == len(dest) 
 94      for src_tok, dest_tok in zip(src, dest):  
 95          dest_tok.annotation = src_tok.annotation 
 96   
 97 -def compress_tokens(tokens): 
 98      """ 
 99      Combine adjacent tokens when there is no HTML between the tokens,  
100      and they share an annotation 
101      """ 
102      result = [tokens[0]]  
103      for tok in tokens[1:]:  
104          if (not result[-1].post_tags and  
105              not tok.pre_tags and  
106              result[-1].annotation == tok.annotation):  
107              compress_merge_back(result, tok) 
108          else:  
109              result.append(tok) 
110      return result 
111   
def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).

    Merging only happens when both the last token and tok are exactly of
    type ``token`` (not a subclass); otherwise tok is simply appended.
    """
    previous = tokens[-1]
    if type(previous) is token and type(tok) is token:
        # Rebuild the combined text, restoring the single space that
        # separated the two words if there was one.
        pieces = [_unicode(previous)]
        if previous.trailing_whitespace:
            pieces.append(' ')
        pieces.append(tok)
        combined = token(''.join(pieces),
                         pre_tags=previous.pre_tags,
                         post_tags=tok.post_tags,
                         trailing_whitespace=tok.trailing_whitespace)
        combined.annotation = previous.annotation
        tokens[-1] = combined
    else:
        tokens.append(tok)
129       
def markup_serialize_tokens(tokens, markup_func):
    """
    Generate the text chunks for a list of tokens.  For each token this
    yields its pre_tags, then its HTML passed through
    markup_func(html, annotation) (plus one trailing space if the token
    had trailing whitespace), then its post_tags.
    """
    for tok in tokens:
        for opening in tok.pre_tags:
            yield opening
        marked_up = markup_func(tok.html(), tok.annotation)
        if tok.trailing_whitespace:
            # The space stays outside of whatever markup_func added
            marked_up += ' '
        yield marked_up
        for closing in tok.post_tags:
            yield closing
145   
146   
147  ############################################################ 
148  ## HTML Diffs 
149  ############################################################ 
150   
def htmldiff(old_html, new_html):
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    chunks = htmldiff_tokens(tokenize(old_html), tokenize(new_html))
    diffed = ''.join(chunks).strip()
    # Push <ins>/<del> inside any block-level elements they wrap
    return fixup_ins_del_tags(diffed)
173   
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # The tokens isolate the portion of the content we care to diff;
    # difflib does the actual hard work.  Afterwards a valid document
    # has to be assembled from pieces of both inputs.  Markup from the
    # new document is preferred; markup from the old document is kept
    # on a best-effort basis and dropped when it cannot be resolved.
    # Deletes are placed as close as possible to where they seem to
    # have been, which is necessarily fuzzy because only the new
    # document's markup is kept.
    matcher = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    merged = []
    for opcode, old_lo, old_hi, new_lo, new_hi in matcher.get_opcodes():
        if opcode == 'equal':
            merged.extend(
                expand_tokens(html2_tokens[new_lo:new_hi], equal=True))
        else:
            # A 'replace' is an insert followed by a delete
            if opcode in ('insert', 'replace'):
                inserted = expand_tokens(html2_tokens[new_lo:new_hi])
                merge_insert(inserted, merged)
            if opcode in ('delete', 'replace'):
                deleted = expand_tokens(html1_tokens[old_lo:old_hi])
                merge_delete(deleted, merged)
    # Deletes were added as DEL_START/DEL_END markers rather than <del>
    # tags, because inserting <del> directly could produce an invalid
    # document.  Now that the whole document exists, resolve them.
    return cleanup_delete(merged)
211   
def expand_tokens(tokens, equal=False):
    """Generate the text chunks (pre tags, word HTML, post tags) for
    each token in tokens.

    When equal is true, tokens whose hide_when_equal attribute is true
    contribute only their tags, not their own HTML.
    """
    for tok in tokens:
        for opening in tok.pre_tags:
            yield opening
        hidden = equal and tok.hide_when_equal
        if not hidden:
            if tok.trailing_whitespace:
                yield tok.html() + ' '
            else:
                yield tok.html()
        for closing in tok.post_tags:
            yield closing
226   
def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Unbalanced opening/closing tags are kept (their counterparts are
    # assumed to exist elsewhere in the document), but only the
    # balanced portion is wrapped in <ins>.
    leading, body, trailing = split_unbalanced(ins_chunks)
    doc.extend(leading)
    if doc and not doc[-1].endswith(' '):
        # The word before the insert didn't end with a space; add one
        doc[-1] += ' '
    if body and body[-1].endswith(' '):
        # The trailing space is moved outside of </ins>
        body[-1] = body[-1][:-1]
    doc.extend(['<ins>'] + body + ['</ins> '] + trailing)
246   
# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    """Marker placed in a chunk list where a pending delete begins."""


class DEL_END:
    """Marker placed in a chunk list where a pending delete ends."""
254   
class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END).

    split_delete raises it and cleanup_delete catches it to know that
    every marker has been resolved. """
258   
def merge_delete(del_chunks, doc):
    """ Appends the text chunks in del_chunks to doc (a list of text
    chunks), bracketed by the DEL_START and DEL_END markers.
    cleanup_delete later turns those markers into <del> tags."""
    doc += [DEL_START]
    doc += del_chunks
    doc += [DEL_END]
266   
def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)"""
    while True:
        # Split around the first pending delete: everything before
        # DEL_START, the deleted chunks, everything after DEL_END.
        try:
            before, deleted, after = split_delete(chunks)
        except NoDeletes:
            # No markers left, the whole document is cleaned up
            return chunks
        # The deleted span may not be well-balanced markup; separate
        # out the dangling start and end tags.
        dangling_open, body, dangling_close = split_unbalanced(deleted)
        # Shift the insertion point forward and/or backward to where
        # those dangling tags fit (mutates before/after in place).
        locate_unbalanced_start(dangling_open, before, after)
        locate_unbalanced_end(dangling_close, before, after)
        if before and not before[-1].endswith(' '):
            # The preceding word had no trailing space; add one
            before[-1] += ' '
        before.append('<del>')
        if body and body[-1].endswith(' '):
            # The trailing space is moved outside of </del>
            body[-1] = body[-1][:-1]
        before.extend(body)
        before.append('</del> ')
        before.extend(after)
        chunks = before
305   
def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    unbalanced_start = []
    unbalanced_end = []
    # Entries are (tag name, reserved index in balanced, tag text)
    open_tags = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            # Plain word
            balanced.append(chunk)
            continue
        is_closing = chunk[1] == '/'
        tag_name = chunk.split()[0].strip('<>/')
        if tag_name in empty_tags:
            balanced.append(chunk)
        elif not is_closing:
            # Reserve a slot; it is only filled in if the tag gets closed
            open_tags.append((tag_name, len(balanced), chunk))
            balanced.append(None)
        elif open_tags and open_tags[-1][0] == tag_name:
            # Closes the innermost open tag: both halves are balanced
            balanced.append(chunk)
            _, slot, opening = open_tags.pop()
            balanced[slot] = opening
        else:
            # A close tag that doesn't match: everything still open is
            # unbalanced, and so is this close tag.
            unbalanced_start.extend([entry[2] for entry in open_tags])
            open_tags = []
            unbalanced_end.append(chunk)
    unbalanced_start.extend([entry[2] for entry in open_tags])
    # Drop the slots reserved for tags that never got closed
    filled = [chunk for chunk in balanced if chunk is not None]
    return unbalanced_start, filled, unbalanced_end
345   
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        start_index = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    end_index = chunks.index(DEL_END)
    before = chunks[:start_index]
    inside = chunks[start_index + 1:end_index]
    after = chunks[end_index + 1:]
    return before, inside, after
357   
def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    # The loop ends normally once every dangling start tag is matched
    while unbalanced_start:
        wanted_name = unbalanced_start[0].split()[0].strip('<>')
        if not post_delete:
            return
        candidate = post_delete[0]
        if candidate is DEL_START or not candidate.startswith('<'):
            # Reached a word (or another delete); can't move forward
            return
        if candidate[1] == '/':
            # Reached a closing tag; don't try to go past it
            return
        candidate_name = candidate.split()[0].strip('<>')
        if candidate_name == 'ins':
            # Can't move into an insert
            return
        assert candidate_name != 'del', (
            "Unexpected delete tag: %r" % candidate)
        if candidate_name != wanted_name:
            # Found a tag that doesn't match
            return
        unbalanced_start.pop(0)
        pre_delete.append(post_delete.pop(0))
407   
def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.

    Matching closing tags are popped off the end of pre_delete and
    pushed onto the front of post_delete; all three lists are modified
    in place.  """
    # The loop ends normally once every dangling end tag is matched
    while unbalanced_end:
        wanted_name = unbalanced_end[-1].split()[0].strip('<>/')
        if not pre_delete:
            return
        candidate = pre_delete[-1]
        if candidate is DEL_END or not candidate.startswith('</'):
            # A word or a start tag
            return
        candidate_name = candidate.split()[0].strip('<>/')
        if candidate_name in ('ins', 'del'):
            # Can't move into an insert or delete
            return
        if candidate_name != wanted_name:
            # Found a tag that doesn't match
            return
        unbalanced_end.pop()
        post_delete.insert(0, pre_delete.pop())
433   
class token(_unicode):
    """ A diffable token: a text string (generally a word shown to the
    user) carrying the markup that surrounds it.

    Adjacent opening tags are attached as pre_tags and the closing tags
    that follow the word as post_tags.  Because of empty tags next to a
    word there can occasionally be close tags in pre_tags or open tags
    in post_tags.

    trailing_whitespace records whether the word was originally followed
    by whitespace.  It is kept as an attribute, not as part of the text,
    so it does not affect comparisons between tokens."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
        obj = _unicode.__new__(cls, text)
        # A fresh list per instance when none is supplied
        obj.pre_tags = [] if pre_tags is None else pre_tags
        obj.post_tags = [] if post_tags is None else post_tags
        obj.trailing_whitespace = trailing_whitespace
        return obj

    def __repr__(self):
        text_repr = _unicode.__repr__(self)
        return 'token(%s, %r, %r)' % (text_repr, self.pre_tags, self.post_tags)

    def html(self):
        """Return the HTML text for this token (its plain string value)."""
        return _unicode(self)
473   
class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.

    The string value used for diffing is "<tag>: <data>"; the markup
    that is actually emitted is html_repr.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=False):
        # Build the comparison text from the tag name.  (The builtin
        # ``type`` used to be interpolated here by mistake, yielding
        # text like "<class 'type'>: src".)
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        # Labels now line up with the values they show; pre_tags and
        # post_tags used to be printed under each other's name.
        return 'tag_token(%s, %s, html_repr=%s, pre_tags=%r, post_tags=%r, trailing_whitespace=%s)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        """Return the original markup of the tag."""
        return self.html_repr
501   
class href_token(token):

    """ The href of an anchor tag, treated as a diffable token.  It is
    flagged hide_when_equal, so unlike ordinary words it is only shown
    in the diff when it changes.  """

    hide_when_equal = True

    def html(self):
        """Return the link target in its displayed form."""
        return ' Link: %s' % (self,)
511   
def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particular bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token.

    An already-parsed lxml element may be passed instead of a string;
    it is then used as-is, without the cleanup step."""
    body_el = html if etree.iselement(html) else parse_html(html, cleanup=True)
    # Flatten into one text chunk per start tag, word and end tag, then
    # regroup those chunks into token objects.
    flat_chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    return fixup_chunks(flat_chunks)
535   
def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    # cleanup_html strips extra structure such as <head>
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)
548   
# Patterns used by cleanup_html: the opening <body> tag, the closing
# </body> tag, and any opening or closing <ins>/<del> tag.  All are
# case-insensitive and let "." span newlines.
_TAG_RE_FLAGS = re.IGNORECASE | re.DOTALL
_body_re = re.compile(r'<body.*?>', _TAG_RE_FLAGS)
_end_body_re = re.compile(r'</body.*?>', _TAG_RE_FLAGS)
_ins_del_re = re.compile(r'</?(ins|del).*?>', _TAG_RE_FLAGS)
552   
def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    body_open = _body_re.search(html)
    if body_open is not None:
        # Keep only what follows the opening <body ...> tag
        html = html[body_open.end():]
    body_close = _end_body_re.search(html)
    if body_close is not None:
        # ... and only what precedes the closing </body> tag
        html = html[:body_close.start()]
    return _ins_del_re.sub('', html)
565       
566   
# Matches a space, tab, newline or carriage return at the end of the
# text; split_words uses it to decide whether the last word keeps a
# trailing space.
end_whitespace_re = re.compile(r'[ \t\n\r]$')
568   
def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.

    A chunk is either a string (a word, a start tag or an end tag) or a
    tuple: ('img', src, tag_html) or ('href', url).  Start tags are
    attached to the following token as pre_tags; end tags are attached
    to the preceding token as post_tags, unless start tags are
    currently pending, in which case they join those.
    """
    def split_trailing_space(text):
        # Returns (text without one trailing space, whether it had one)
        if text.endswith(' '):
            return text[:-1], True
        return text, False

    pending_tags = []
    last_token = None
    tokens = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            kind = chunk[0]
            if kind == 'img':
                markup, has_space = split_trailing_space(chunk[2])
                last_token = tag_token('img', chunk[1], html_repr=markup,
                                       pre_tags=pending_tags,
                                       trailing_whitespace=has_space)
            elif kind == 'href':
                last_token = href_token(chunk[1], pre_tags=pending_tags,
                                        trailing_whitespace=True)
            else:
                # Other tuple kinds are ignored
                continue
            pending_tags = []
            tokens.append(last_token)
            continue
        if is_word(chunk):
            word, has_space = split_trailing_space(chunk)
            last_token = token(word, pre_tags=pending_tags,
                               trailing_whitespace=has_space)
            pending_tags = []
            tokens.append(last_token)
        elif is_start_tag(chunk):
            pending_tags.append(chunk)
        elif is_end_tag(chunk):
            if pending_tags:
                pending_tags.append(chunk)
            else:
                assert last_token, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (last_token, tokens, chunk, chunks))
                last_token.post_tags.append(chunk)
        else:
            assert(0)

    if not tokens:
        # No words at all: a single empty token carries the markup
        return [token('', pre_tags=pending_tags)]
    # Leftover tags belong after the final word
    tokens[-1].post_tags.extend(pending_tags)
    return tokens
625   
626   
# All the tags in HTML that don't require end tags:
empty_tags = (
    'param',
    'img',
    'area',
    'br',
    'basefont',
    'input',
    'base',
    'meta',
    'link',
    'col',
)
631   
# Tags treated as block-level elements when deciding whether <ins> and
# <del> have to be moved inside an element.
block_level_tags = (
    'address', 'blockquote', 'center', 'dir', 'div', 'dl',
    'fieldset', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'isindex', 'menu', 'noframes', 'noscript',
    'ol', 'p', 'pre', 'table', 'ul',
)
658   
# Additional tags that are handled like block-level tags by
# _contains_block_level_tag.
block_level_container_tags = (
    'dd', 'dt', 'frameset', 'li',
    'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
)
671   
672   
def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    <img> elements are yielded as ('img', src, start_tag_html) tuples
    (src is None when the attribute is missing), and when include_hrefs
    is true a ('href', url) tuple is yielded just before the end tag of
    each <a> that has an href.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            # el.get() rather than el.attrib[...]: an <img> without a
            # src attribute must not abort the whole diff with KeyError.
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        # An empty tag with no content: there is no end tag to emit
        return
    # html_escape is the module-level alias for html.escape (cgi.escape
    # on Python 2); cgi.escape() itself was removed in Python 3.8.
    # quote=False keeps the old default of leaving quote characters
    # unescaped in text.
    for word in split_words(el.text):
        yield html_escape(word, quote=False)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
        yield ('href', el.attrib['href'])
    if not skip_tag:
        yield end_tag(el)
        # The tail is the text following this element's end tag
        for word in split_words(el.tail):
            yield html_escape(word, quote=False)
700   
def split_words(text):
    """ Splits some text into words. Includes trailing whitespace (one
    space) on each word when appropriate.

    Every word but the last always gets the space; the last word gets
    it only if the text itself ended in whitespace.  Returns an empty
    list for empty, None or whitespace-only text.  """
    if not text or not text.strip():
        return []
    words = text.strip().split()
    spaced = [word + ' ' for word in words[:-1]]
    final = words[-1]
    if end_whitespace_re.search(text):
        final += ' '
    spaced.append(final)
    return spaced
710   
# Matches a space, tab, newline or carriage return at the start of the
# text; end_tag uses it to decide whether a space follows the tag.
start_whitespace_re = re.compile(r'^[ \t\n\r]')
712   
def start_tag(el):
    """
    The text representation of the start tag for a tag.

    Attributes are written as name="value", with the value
    HTML-escaped (including quote characters).
    """
    # html_escape is the module-level alias for html.escape (cgi.escape
    # on Python 2); cgi.escape() itself was removed in Python 3.8.
    attributes = ''.join([' %s="%s"' % (name, html_escape(value, True))
                          for name, value in el.attrib.items()])
    return '<%s%s>' % (el.tag, attributes)
720   
def end_tag(el):
    """ The text representation of an end tag for a tag.  A single
    trailing space is included when the element's tail text starts
    with whitespace.  """
    if el.tail and start_whitespace_re.search(el.tail):
        return '</%s>%s' % (el.tag, ' ')
    return '</%s>%s' % (el.tag, '')
729   
def is_word(tok):
    """True if the chunk is text rather than a tag (no leading '<')."""
    if tok.startswith('<'):
        return False
    return True
732   
def is_end_tag(tok):
    """True if the chunk is a closing tag (begins with '</')."""
    closing_prefix = '</'
    return tok.startswith(closing_prefix)
735   
def is_start_tag(tok):
    """True if the chunk is a tag ('<...') that is not a closing tag."""
    if tok.startswith('</'):
        return False
    return tok.startswith('<')
738   
def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    # cleanup=False: the <ins>/<del> tags must survive parsing here
    root = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(root)
    # parse_html wrapped the fragment in an extra element; skip_outer
    # leaves it out of the serialized result
    return serialize_html_fragment(root, skip_outer=True)
747   
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the elements tail.

    If skip_outer is true, then don't serialize the outermost tag
    (the result is also stripped of surrounding whitespace)
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    serialized = etree.tostring(el, method="html", encoding=_unicode)
    if not skip_outer:
        return serialized
    # Cut everything up to and including the first '>' (the outer start
    # tag), then everything from the last '<' on (the outer end tag).
    without_open = serialized[serialized.find('>') + 1:]
    inner = without_open[:without_open.rfind('<')]
    return inner.strip()
765   
def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place

    Every <ins>/<del> element that contains a block-level tag has its
    markup pushed down inside the blocks, and is then itself dropped.
    """
    for tag in ('ins', 'del'):
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if _contains_block_level_tag(el):
                _move_el_inside_block(el, tag=tag)
                el.drop_tag()
776   
def _contains_block_level_tag(el):
    """True if the element is, or contains at any depth, a block-level
    element, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    return any(_contains_block_level_tag(child) for child in el)
786   
def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.

    el is modified in place: its text and children end up wrapped in
    new ``tag`` elements placed as deep as needed so that no ``tag``
    element contains a block-level element.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child: wrap the whole content of
        # el (text and children) in a single new element.
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    # Iterate over a copy, since el's children are replaced/inserted
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                # Text following the block gets its own wrapper, placed
                # right after the block
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            # Inline child: wrap it as a whole
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        # Leading text of el gets a wrapper as the first child
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)
819               
def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            # No children: the tail simply follows the element's text
            text += el.tail
        else:
            # The tail has to follow the last child instead
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            # el is the first child: its text joins the parent's text
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            # Otherwise it joins the tail of the preceding sibling
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    # list(el) replaces the deprecated el.getchildren()
    parent[index:index+1] = list(el)
853   
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    # Matching blocks must be longer than this to be kept, unless the
    # shorter sequence is so short that a quarter of it is smaller.
    threshold = 2

    def get_matching_blocks(self):
        """Return the matching blocks, without the very short ones.

        A block is kept when its size exceeds
        min(threshold, shorter_sequence_length // 4), or when its size
        is zero (the terminating sentinel block).
        """
        # The shorter of the two sequences sets the scale.  (Both
        # operands used to be len(self.b), so self.a was ignored.)
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size // 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
869   
# When executed as a script, hand over to the main() function of
# lxml.html._diffcommand.
if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()
873