import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import cgi
import re

__all__ = ['html_annotate', 'htmldiff']

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        cgi.escape(unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> html_annotate([(version1, 'version 1'),
        ...                (version2, 'version 2')])
        u'<span title="version 2">Goodbye</span> <span title="version 1">World</span>'

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> default_markup('Some Text', 'by Joe')
        u'<span title="by Joe">Some Text</span>'
    """
    # The basic strategy is to split each document into logical tokens
    # (words with attached markup), then diff successive versions to
    # track when a token first appeared in the document; the annotation
    # attached to a token is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation:
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add the markup:
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
            not tok.pre_tags and
            result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
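
# Illustrative sketch (not part of the original module): two adjacent
# tokens with the same annotation and no intervening markup collapse
# into one:
#
#     >>> toks = tokenize_annotated('Hello World', 'v1')
#     >>> len(compress_tokens(toks))
#     1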

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place). """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = unicode(last)
        if last.trailing_whitespace:
            text += ' '
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += ' '
        yield html
        for post in token.post_tags:
            yield post

############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode); they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
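
# Example usage, as in the lxml documentation (output spacing may vary
# slightly between versions):
#
#     >>> print(htmldiff('<p>Here is some text.</p>',
#     ...                '<p>Here is <b>a lot</b> of <i>text</i>.</p>'))
#     <p>Here is <ins><b>a lot</b> of <i>text</i>.</ins> <del>some text.</del></p>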

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only make a best-effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  We also try to put the deletes as close
    # as possible to the location where the deleted text originally
    # appeared.

    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)

    # If deletes were inserted directly as <del> we would have an
    # invalid document at this point.  Instead we put in special
    # markers, and once the complete diffed document has been created
    # we try to move the deletes into the closest markup:
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + ' '
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # We don't throw away unbalanced start or end tags (we assume there
    # is accompanying markup later or earlier in the document), but we
    # only put <ins> around the balanced portion:
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't
        # end with a space:
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move the space outside of </ins>:
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)
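
# Illustrative sketch (not part of the original module): the insert is
# wrapped in <ins>, with the trailing space moved outside the tag:
#
#     >>> doc = ['<p>', 'Hi']
#     >>> merge_insert(['there '], doc)
#     >>> doc
#     ['<p>', 'Hi ', '<ins>', 'there', '</ins> ']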

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks), with markers to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to place it in a
    location similar to where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))."""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END:
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up the case where the word before us didn't have a
            # trailing space:
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move the space outside of </del>:
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks
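
# Illustrative sketch (not part of the original module): the sentinels
# become a balanced <del>...</del> pair:
#
#     >>> cleanup_delete(['<p>', 'Hi ', DEL_START, 'there', DEL_END, '</p>'])
#     ['<p>', 'Hi ', '<del>', 'there', '</del> ', '</p>']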

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed, in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end
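
# Illustrative sketch (not part of the original module):
#
#     >>> split_unbalanced(['<b>', 'Hello', '</b>', '</div>'])
#     ([], ['<b>', 'Hello', '</b>'], ['</div>'])
#     >>> split_unbalanced(['<div>', 'Text'])
#     (['<div>'], ['Text'], [])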

def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ Like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags), as are closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
        obj = unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r)' % (unicode.__repr__(self), self.pre_tags, self.post_tags)

    def html(self):
        return unicode(self)

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=False):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return 'Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    # First we parse the HTML into a single tree:
    body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-join them into token objects:
    return fixup_chunks(chunks)
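
# Illustrative sketch (not part of the original module): words become
# tokens, with the enclosing markup attached as pre_tags/post_tags:
#
#     >>> [unicode(t) for t in tokenize('<p>Hi <b>there</b></p>')]
#     [u'Hi', u'there']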

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the
    HTML will be wrapped in a <div> tag that was not in the original
    document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html
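
# Illustrative sketch (not part of the original module):
#
#     >>> cleanup_html('<html><body><p>Hi <ins>new</ins></p></body></html>')
#     '<p>Hi new</p>'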

end_whitespace_re = re.compile(r'[ \t\n\r]$')

def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag = chunk[2]
                if tag.endswith(' '):
                    tag = tag[:-1]
                    trailing_whitespace = True
                else:
                    trailing_whitespace = False
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)
            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
                tag_accum = []
                result.append(cur_word)
            continue
        if is_word(chunk):
            if chunk.endswith(' '):
                chunk = chunk[:-1]
                trailing_whitespace = True
            else:
                trailing_whitespace = False
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)
        elif is_start_tag(chunk):
            tag_accum.append(chunk)
        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False, "Unexpected chunk: %r" % chunk

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )

def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.attrib['src'], start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield cgi.escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
        yield ('href', el.attrib['href'])
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield cgi.escape(word)
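
# Illustrative sketch (not part of the original module):
#
#     >>> el = fragment_fromstring('<p>Hi</p>')
#     >>> list(flatten_el(el, include_hrefs=False))
#     ['<p>', 'Hi', '</p>']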

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace (one
    space) on each word when appropriate.  """
    if not text or not text.strip():
        return []
    words = [w + ' ' for w in text.strip().split()]
    if not end_whitespace_re.search(text):
        words[-1] = words[-1][:-1]
    return words
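
# Illustrative sketch (not part of the original module):
#
#     >>> split_words('Hello  World')
#     ['Hello ', 'World']
#     >>> split_words('Hello World ')
#     ['Hello ', 'World ']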

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, cgi.escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html
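
# Illustrative sketch (not part of the original module):
#
#     >>> fixup_ins_del_tags('<ins><p>Hi</p></ins>')
#     '<p><ins>Hi</ins></p>'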

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding="UTF-8")
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ Helper for _fixup_ins_del_tags; actually takes the <ins> etc. tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
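
# Illustrative sketch (not part of the original module): a lone
# one-token match inside a changed region is discarded, while the
# three-token match and the zero-length terminator block are kept:
#
#     >>> m = InsensitiveSequenceMatcher(a=list('abxyz'), b=list('aqxyz'))
#     >>> [tuple(blk) for blk in m.get_matching_blocks()]
#     [(2, 2, 3), (5, 5, 0)]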

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()