
Source Code for Module lxml.html.diff

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

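# Illustrative note (not part of the module): a minimal sketch of how
# annotated tokens collapse; exact reprs depend on the lxml version.
#
#     toks = tokenize_annotated('<p>Hello world</p>', 'v1')
#     # two tokens: 'Hello' (pre_tags ['<p>']) and 'world' (post_tags ['</p>'])
#     merged = compress_tokens(toks)
#     # roughly one token, 'Hello world', still annotated 'v1'
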
def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += ' '
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += ' '
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)

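# Illustrative usage (not part of the module): htmldiff on two small
# fragments; the exact placement of <ins>/<del> may vary between
# lxml versions.
#
#     from lxml.html.diff import htmldiff
#     print(htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>'))
#     # roughly: <p><ins>Goodbye</ins> <del>Hello</del> World</p>
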
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + ' '
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end with
        # a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)


# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass

class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)"""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end

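# Illustrative note (not part of the module): what split_unbalanced
# returns for a small chunk list.
#
#     split_unbalanced(['<p>', 'Hi', '</div>'])
#     # -> (['<p>'], ['Hi'], ['</div>'])
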
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags that are adjacent to the word are attached
    to it as pre_tags, and closing tags that follow the word as
    post_tags.  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, self.post_tags)

    def html(self):
        return _unicode(self)

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=False):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parses the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)

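# Illustrative note (not part of the module): what tokenize produces
# for a tiny fragment; exact reprs depend on the lxml version.
#
#     tokenize('<p>Hello world</p>')
#     # roughly two tokens: 'Hello' (pre_tags ['<p>'], trailing_whitespace True)
#     # and 'world' (post_tags ['</p>'])
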
def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)


_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html


end_whitespace_re = re.compile(r'[ \t\n\r]$')

def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag = chunk[2]
                if tag.endswith(' '):
                    tag = tag[:-1]
                    trailing_whitespace = True
                else:
                    trailing_whitespace = False
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)
            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            if chunk.endswith(' '):
                chunk = chunk[:-1]
                trailing_whitespace = True
            else:
                trailing_whitespace = False
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)
        elif is_start_tag(chunk):
            tag_accum.append(chunk)
        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert 0

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )

def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace (one
    space) on each word when appropriate.  """
    if not text or not text.strip():
        return []
    words = [w + ' ' for w in text.strip().split()]
    if not end_whitespace_re.search(text):
        words[-1] = words[-1][:-1]
    return words

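# Illustrative note (not part of the module): how trailing whitespace
# is preserved on the last word.
#
#     split_words('Hello world')    # -> ['Hello ', 'world']
#     split_words('Hello world ')   # -> ['Hello ', 'world ']
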
start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

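# Illustrative usage (not part of the module): pushing an <ins> inside
# a block-level element, as described in the docstring above.
#
#     fixup_ins_del_tags('<ins><p>word</p></ins>')
#     # roughly: '<p><ins>word</ins></p>'
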
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]

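# Illustrative note (not part of the module): the threshold drops tiny
# "equal" runs, so a single shared word inside a large rewrite is
# treated as changed rather than splitting the <ins>/<del> spans.
#
#     m = InsensitiveSequenceMatcher(a=['x'] * 20, b=['y'] * 10 + ['x'] + ['z'] * 10)
#     # the lone matching 'x' is filtered out of m.get_matching_blocks()
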
if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()