import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy is to split each document into logical tokens
    # (words with any attached markup), then diff each version against
    # the previous one so that every token carries the annotation of
    # the version in which it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After the annotations have been tracked, adjacent tokens that
    # share an annotation are combined into single spans
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
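
# Illustrative sketch (not part of the original module): two adjacent words
# with the same annotation and no intervening tags collapse into one token.
#
#     toks = tokenize_annotated('Hello World', 'v1')
#     merged = compress_tokens(toks)
#     # len(merged) == 1 and merged[0] compares equal to 'Hello World'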

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += ' '
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += ' '
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
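
# Illustrative usage (not part of the original module):
#
#     htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>')
#
# returns roughly '<p> <ins>Goodbye</ins> <del>Hello</del> World</p>';
# the exact whitespace around the <ins>/<del> spans can vary.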

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only make a best-effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  We also try to put the deletes as close
    # as possible to the location where we think they would have been,
    # preferring to put them in the location of the change.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> we could have an
    # invalid document at this point.  Instead we insert special
    # markers, and once the complete diffed document has been created
    # we move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + ' '
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Unbalanced tags are not wrapped in <ins> (they belong to markup
    # that continues before or after the insert); only the balanced
    # portion goes inside the <ins> element.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end
        # with a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # Move the trailing space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)
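
# Illustrative sketch (not part of the original module): wrapping an inserted
# word; the trailing space moves outside the closing tag.
#
#     doc = ['Hello ']
#     merge_insert(['Goodbye '], doc)
#     # doc == ['Hello ', '<ins>', 'Goodbye', '</ins> ']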

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location to where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))."""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff inside DEL_START/DEL_END may not be well-balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up the case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # Move the trailing space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks
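
# Illustrative sketch (not part of the original module): a delete marked with
# the sentinels is rewritten into a <del> span, with the trailing space moved
# outside the closing tag.
#
#     cleanup_delete([DEL_START, 'Hello ', DEL_END])
#     # -> ['<del>', 'Hello', '</del> ']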

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end
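
# Illustrative sketch (not part of the original module): a span that opens a
# <div> it never closes and closes a <b> it never opened.
#
#     split_unbalanced(['<div>', 'text', '</b>'])
#     # -> (['<div>'], ['text'], ['</b>'])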

def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, we can't go further
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, self.post_tags)

    def html(self):
        return _unicode(self)

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=False):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Rejoin the chunks into token objects:
    return fixup_chunks(chunks)

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html
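
# Illustrative sketch (not part of the original module): page structure and
# any existing <ins>/<del> markup are stripped before diffing.
#
#     cleanup_html('<html><body><p>Hi <ins>new</ins></p></body></html>')
#     # -> '<p>Hi new</p>'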

end_whitespace_re = re.compile(r'[ \t\n\r]$')

def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag = chunk[2]
                if tag.endswith(' '):
                    tag = tag[:-1]
                    trailing_whitespace = True
                else:
                    trailing_whitespace = False
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)
            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
                tag_accum = []
                result.append(cur_word)
            continue
        if is_word(chunk):
            if chunk.endswith(' '):
                chunk = chunk[:-1]
                trailing_whitespace = True
            else:
                trailing_whitespace = False
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)
        elif is_start_tag(chunk):
            tag_accum.append(chunk)
        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert(0)

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result
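
# Illustrative sketch (not part of the original module): chunks produced by
# flatten_el() become tokens, with the surrounding tags attached to the words.
#
#     fixup_chunks(['<p>', 'Hello ', 'World', '</p>'])
#     # -> [token('Hello', pre_tags=['<p>']), token('World', post_tags=['</p>'])]
#     # ('Hello' additionally records its trailing whitespace)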


# HTML tags that do not require an end tag:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )

def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace (one
    space) on each word when appropriate.  """
    if not text or not text.strip():
        return []
    words = [w + ' ' for w in text.strip().split()]
    if not end_whitespace_re.search(text):
        words[-1] = words[-1][:-1]
    return words
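
# Illustrative sketch (not part of the original module): each word keeps one
# trailing space, except the last word when the text itself did not end in
# whitespace.
#
#     split_words('Hello  World')    # -> ['Hello ', 'World']
#     split_words('Hello World ')    # -> ['Hello ', 'World ']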

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)
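
# Illustrative sketch (not part of the original module): serializing the tags
# of a parsed anchor element.
#
#     el = fragment_fromstring('<a href="/x">link</a>')
#     start_tag(el)    # -> '<a href="/x">'
#     end_tag(el)      # -> '</a>' (no trailing space, since el has no tail)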

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in list(doc.iter(tag)):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>,
    etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; takes the <ins>/<del> element el and
    moves its markup inside any block-level tags it contains.  (Minimal
    sketch; the upstream implementation may differ in detail.) """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level children: wrap all of el's content in one new tag
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            # Recurse into block-level children, wrapping any tail text
            # in its own tag so it is not lost:
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                child.addnext(tail_tag)
        else:
            # Wrap inline children in the tag:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
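
# Illustrative sketch (not part of the original module): for longer sequences
# the threshold is 2, so an isolated 1-token match between two otherwise
# different spans is dropped and the whole span is reported as a replacement.
#
#     a = 'one two three four five six seven eight'.split()
#     b = 'uno dos three quatro cinco seis siete ocho'.split()
#     m = InsensitiveSequenceMatcher(a=a, b=b)
#     # m.get_matching_blocks() keeps only the zero-length terminator block;
#     # the lone 'three' match is below the threshold.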

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()