Package lxml :: Package html :: Module diff
[hide private]
[frames | no frames]

Source Code for Module lxml.html.diff

# NOTE: the cgi module was removed in Python 3.13; this import can be dropped
# once nothing in the module references cgi.escape any more.
import cgi
import difflib
import re

from lxml import etree
from lxml.html import fragment_fromstring

try:
    from html import escape as html_escape  # Python 3.2+
except ImportError:
    from cgi import escape as html_escape  # Python 2
  6   
  7  __all__ = ['html_annotate', 'htmldiff'] 
  8   
  9  try: 
 10      _unicode = unicode 
 11  except NameError: 
 12      # Python 3 
 13      _unicode = str 
 14  try: 
 15      basestring = __builtins__["basestring"] 
 16  except (KeyError, NameError): 
 17      # Python 3 
 18      basestring = str 
 19   
 20  ############################################################ 
 21  ## Annotation 
 22  ############################################################ 
 23   
def default_markup(text, version):
    """Default ``markup`` callback for html_annotate().

    Wraps ``text`` in a ``<span>`` whose ``title`` attribute carries the
    version label.  ``text`` is inserted as-is; only ``version`` is
    converted to text and escaped (including double quotes) so it is safe
    inside the attribute value.
    """
    # cgi.escape() was removed in Python 3.8; html_escape is the module-level
    # alias for html.escape (falling back to cgi.escape on Python 2).
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), True), text)
27
def html_annotate(doclist, markup=default_markup):
    """
    Annotate the newest document with the version each word first appeared in.

    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function used to mark up spans of words.
    It is called like markup('Hello', 'version 2') and returns HTML.
    The first argument is text and never includes any markup.  The
    default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # Every document becomes a list of tokens (words with attached markup),
    # each initially annotated with its own version.
    versions = [tokenize_annotated(doc, version)
                for doc, version in doclist]
    newest = versions[0]
    # Diff each version against its successor so that unchanged tokens in
    # the successor inherit the annotation of where they first appeared.
    for older, newer in zip(versions, versions[1:]):
        html_annotate_merge_annotations(older, newer)
        newest = newer

    # Join adjacent tokens that share an annotation, then add the markup.
    chunks = markup_serialize_tokens(compress_tokens(newest), markup)
    return ''.join(chunks).strip()
67
def tokenize_annotated(doc, annotation):
    """Tokenize ``doc`` (hrefs excluded) and tag every token with ``annotation``.

    Returns the list of tokens, each carrying an ``annotation`` attribute.
    """
    annotated = tokenize(doc, include_hrefs=False)
    for item in annotated:
        item.annotation = annotation
    return annotated
def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Carry annotations forward from ``tokens_old`` to ``tokens_new``.

    Tokens of the new document that the diff reports as unchanged take over
    the annotation of their counterpart in the old document; ``tokens_new``
    is modified in place.
    """
    matcher = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    for opcode in matcher.get_opcodes():
        kind, old_lo, old_hi, new_lo, new_hi = opcode
        if kind != 'equal':
            continue
        copy_annotations(tokens_old[old_lo:old_hi],
                         tokens_new[new_lo:new_hi])
def copy_annotations(src, dest):
    """
    Copy the ``annotation`` of each token in src onto the token at the same
    position in dest.  Both sequences must have the same length.
    """
    assert len(src) == len(dest)
    for old_tok, new_tok in zip(src, dest):
        new_tok.annotation = old_tok.annotation
def compress_tokens(tokens):
    """
    Merge runs of adjacent tokens that have no HTML between them and
    carry the same annotation.  Returns a new list; tokens must be non-empty.
    """
    merged = [tokens[0]]
    for current in tokens[1:]:
        previous = merged[-1]
        # Any tag between the two words keeps them apart.
        separated = previous.post_tags or current.pre_tags
        if separated or not (previous.annotation == current.annotation):
            merged.append(current)
        else:
            compress_merge_back(merged, current)
    return merged
111
def compress_merge_back(tokens, tok):
    """ Fold tok into the last element of tokens, modifying the list
    in place.  Only two plain ``token`` instances are merged; if either
    one is of another type, tok is simply appended. """
    previous = tokens[-1]
    if not (type(previous) is token and type(tok) is token):
        tokens.append(tok)
        return
    # Re-insert the space that separated the two words, if there was one.
    separator = ' ' if previous.trailing_whitespace else ''
    combined = token(_unicode(previous) + separator + tok,
                     pre_tags=previous.pre_tags,
                     post_tags=tok.post_tags,
                     trailing_whitespace=tok.trailing_whitespace)
    combined.annotation = previous.annotation
    tokens[-1] = combined
def markup_serialize_tokens(tokens, markup_func):
    """
    Generate the text chunks for a list of annotated tokens.

    For each token this yields its pre_tags, then the token's HTML passed
    through markup_func(html, annotation) (plus a single space when the
    token had trailing whitespace), then its post_tags.
    """
    for tok in tokens:
        for opening in tok.pre_tags:
            yield opening
        annotated = markup_func(tok.html(), tok.annotation)
        yield (annotated + ' ') if tok.trailing_whitespace else annotated
        for closing in tok.post_tags:
            yield closing
145 146 147 ############################################################ 148 ## HTML Diffs 149 ############################################################ 150
def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Diff two HTML documents and return the annotated result.

    Both documents are HTML *fragments* (str/UTF8 or unicode), not
    complete documents (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored: the markup from new_html is
    preserved, along with possibly some markup from old_html (losing
    some of the old markup is considered acceptable).  Only the words
    in the HTML are diffed.  The exceptions are <img> tags, which are
    treated like words, and the href attribute of <a> tags, which is
    noted inside the tag itself when it changes.
    """
    chunks = htmldiff_tokens(tokenize(old_html), tokenize(new_html))
    return fixup_ins_del_tags(''.join(chunks).strip())
173
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Diff two token lists and return a list of text chunks (not
    tokens) describing the combined document.
    """
    # The tokens isolate the part of the content we want to compare;
    # difflib does the actual comparison.
    #
    # The result has to be assembled from pieces of both documents.  Markup
    # from the new document is preferred; markup from the old document is
    # kept on a best-effort basis and dropped when it cannot be resolved.
    # Deleted text is placed as close as possible to where it used to be,
    # which is necessarily fuzzy because only the new markup is kept.
    matcher = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    chunks = []
    for opcode, old_lo, old_hi, new_lo, new_hi in matcher.get_opcodes():
        if opcode == 'equal':
            chunks.extend(
                expand_tokens(html2_tokens[new_lo:new_hi], equal=True))
        else:
            # A 'replace' is handled as an insert followed by a delete.
            if opcode in ('insert', 'replace'):
                merge_insert(
                    expand_tokens(html2_tokens[new_lo:new_hi]), chunks)
            if opcode in ('delete', 'replace'):
                merge_delete(
                    expand_tokens(html1_tokens[old_lo:old_hi]), chunks)
    # Deletes were only recorded as marker objects so far, because writing
    # <del> tags directly could produce an invalid document.  Now that the
    # whole document exists, resolve the markers into real tags.
    return cleanup_delete(chunks)
211
def expand_tokens(tokens, equal=False):
    """Generate the text chunks (tags and words) for a list of tokens.

    When ``equal`` is true, tokens flagged ``hide_when_equal`` contribute
    only their surrounding tags, not their own HTML.
    """
    for tok in tokens:
        for opening in tok.pre_tags:
            yield opening
        if not (equal and tok.hide_when_equal):
            suffix = ' ' if tok.trailing_whitespace else ''
            yield tok.html() + suffix
        for closing in tok.post_tags:
            yield closing
226
def merge_insert(ins_chunks, doc):
    """ Append <ins>ins_chunks</ins> to doc, the already-handled
    document (a list of text chunks).  doc is modified in place. """
    # Unbalanced opening/closing tags are kept (their counterparts are
    # assumed to exist elsewhere in the document), but only the balanced
    # portion is wrapped in <ins>.
    opening, body, closing = split_unbalanced(ins_chunks)
    doc.extend(opening)
    if doc:
        tail = doc[-1]
        if not tail.endswith(' '):
            # The word before the insert had no trailing space; add one.
            doc[-1] = tail + ' '
    if body:
        final = body[-1]
        if final.endswith(' '):
            # The trailing space goes after </ins> rather than inside it.
            body[-1] = final[:-1]
    doc.extend(['<ins>'] + body + ['</ins> '] + closing)

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    """Marker placed in the chunk list where a deleted span begins."""
class DEL_END:
    """Marker placed in the chunk list where a deleted span ends."""
254
class NoDeletes(Exception):
    """ Signals that a chunk list has no pending delete left, i.e. it
    contains no DEL_START marker. """
258
def merge_delete(del_chunks, doc):
    """ Append the text chunks in del_chunks to doc (another list of
    text chunks), bracketed by DEL_START/DEL_END markers.
    cleanup_delete later turns these markers into <del> tags."""
    doc.append(DEL_START)
    for chunk in del_chunks:
        doc.append(chunk)
    doc.append(DEL_END)
266
def cleanup_delete(chunks):
    """ Replace every DEL_START/DEL_END marker pair in chunks with
    <del></del> and return the resulting chunk list.

    To keep the document valid, some start or end tags of the deleted
    span may be dropped.  The delete may also be moved into adjacent
    tags so that it ends up near its original location (e.g. moving a
    delete into a preceding <div> tag when the delete looks like
    (DEL_START, 'Text</div>', DEL_END))."""
    while True:
        # Split around the first pending delete.
        try:
            before, deleted, after = split_delete(chunks)
        except NoDeletes:
            # No markers left: the whole document has been cleaned up.
            return chunks
        # The deleted span may contain unbalanced markup; separate it out.
        dangling_open, body, dangling_close = split_unbalanced(deleted)
        # Shift the insertion point forward/backward to account for the
        # unbalanced tags (this mutates ``before`` and ``after``).
        locate_unbalanced_start(dangling_open, before, after)
        locate_unbalanced_end(dangling_close, before, after)
        if before and not before[-1].endswith(' '):
            # The preceding word had no trailing space; add one.
            before[-1] += ' '
        if body and body[-1].endswith(' '):
            # The trailing space goes after </del> rather than inside it.
            body[-1] = body[-1][:-1]
        chunks = before + ['<del>'] + body + ['</del> '] + after
305
def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), each a list
    of text and tag chunks.

    unbalanced_start holds the tags that are opened but not closed
    within this span; unbalanced_end holds the tags that are closed but
    were not opened.  Pulling these out can reorder the chunks."""
    dangling_open = []
    dangling_close = []
    # Open tags still waiting for their end tag: (name, slot, markup).
    pending = []
    # Balanced output; None reserves the slot of a not-yet-closed open tag.
    slots = []
    for chunk in chunks:
        is_tag = chunk.startswith('<')
        if is_tag:
            closing = chunk[1] == '/'
            tag_name = chunk.split()[0].strip('<>/')
        if not is_tag or tag_name in empty_tags:
            # Words and empty tags are always balanced.
            slots.append(chunk)
        elif not closing:
            pending.append((tag_name, len(slots), chunk))
            slots.append(None)
        elif pending and pending[-1][0] == tag_name:
            # Matches the innermost open tag: both become balanced.
            slots.append(chunk)
            _, slot, opener = pending.pop()
            slots[slot] = opener
        else:
            # Mismatched end tag: give up on everything still open.
            dangling_open.extend([opener for _, _, opener in pending])
            pending = []
            dangling_close.append(chunk)
    dangling_open.extend([opener for _, _, opener in pending])
    balanced = [chunk for chunk in slots if chunk is not None]
    return dangling_open, balanced, dangling_close
345
def split_delete(chunks):
    """ Split chunks around the first DEL_START/DEL_END pair.

    Returns (chunks_before_DEL_START, chunks_between_the_markers,
    chunks_after_DEL_END); the last part may contain further
    DEL_STARTs.  Raises NoDeletes when chunks has no DEL_START. """
    try:
        start = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    stop = chunks.index(DEL_END)
    before = chunks[:start]
    inside = chunks[start + 1:stop]
    after = chunks[stop + 1:]
    return before, inside, after
357
def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete together mark a position in the
    document (the point where the two were split).  This function moves
    that position forward, popping chunks off post_delete and pushing
    them onto pre_delete, for as long as the next chunk is the start tag
    that unbalanced_start is looking for.  All three lists are modified
    in place.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    The position moved so that the dangling <div> is effectively
    replaced by the div already in the document.  If no match is found,
    nothing is moved and the caller simply discards unbalanced_start.
    """
    while unbalanced_start:
        wanted = unbalanced_start[0].split()[0].strip('<>')
        if not post_delete:
            return
        candidate = post_delete[0]
        if candidate is DEL_START or not candidate.startswith('<'):
            # A word (or another delete): cannot move any further.
            return
        if candidate[1] == '/':
            # A closing tag stops the search.
            return
        candidate_name = candidate.split()[0].strip('<>')
        if candidate_name == 'ins':
            # Never move into an insert.
            return
        assert candidate_name != 'del', (
            "Unexpected delete tag: %r" % candidate)
        if candidate_name != wanted:
            # A different tag: give up.
            return
        unbalanced_start.pop(0)
        pre_delete.append(post_delete.pop(0))
407
def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ Counterpart of locate_unbalanced_start for end tags: moves the
    split position earlier in the document, popping matching end tags
    off pre_delete and pushing them onto the front of post_delete.
    All three lists are modified in place. """
    while unbalanced_end:
        wanted = unbalanced_end[-1].split()[0].strip('<>/')
        if not pre_delete:
            return
        candidate = pre_delete[-1]
        if candidate is DEL_END or not candidate.startswith('</'):
            # A word or a start tag: cannot move any further back.
            return
        candidate_name = candidate.split()[0].strip('<>/')
        if candidate_name in ('ins', 'del'):
            # Never move into an insert or delete.
            return
        if candidate_name != wanted:
            # A different tag: give up.
            return
        unbalanced_end.pop()
        post_delete.insert(0, pre_delete.pop())
433
class token(_unicode):
    """ A diffable token, generally a word that is displayed to the
    user.  The string value is the word itself.  Adjacent opening tags
    are attached as pre_tags and closing tags that follow the word as
    post_tags.  Because of empty tags next to a word, pre_tags may
    contain close tags and post_tags may contain open tags.

    trailing_whitespace records whether the word was originally
    followed by whitespace; it is kept as an attribute rather than in
    the string value, so it does not take part in comparing the word
    with a similar word that has no trailing space."""

    # When true, the token is left out of the displayed diff if it did
    # not change:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
        self = _unicode.__new__(cls, text)
        # Fresh lists per instance unless the caller supplies its own.
        self.pre_tags = [] if pre_tags is None else pre_tags
        self.post_tags = [] if post_tags is None else post_tags
        self.trailing_whitespace = trailing_whitespace
        return self

    def __repr__(self):
        text_repr = _unicode.__repr__(self)
        return 'token(%s, %r, %r)' % (text_repr, self.pre_tags, self.post_tags)

    def html(self):
        """Return the token's text as a plain string."""
        return _unicode(self)
473
class tag_token(token):

    """ A token that is actually a tag.  Currently this is just the
    <img> tag, which takes up visible space like a word but is only
    represented in a document by a tag.

    The string value used for diffing is "<tag>: <data>"; html() returns
    the stored html_repr instead of that value. """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=False):
        # Build the diffable text from the tag name.  The original
        # interpolated the ``type`` builtin here by mistake.
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        # Labels now match the values they show (pre_tags and post_tags
        # were previously printed under each other's name).
        return 'tag_token(%s, %s, html_repr=%s, pre_tags=%r, post_tags=%r, trailing_whitespace=%s)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        """Return the tag's original HTML representation."""
        return self.html_repr
501
class href_token(token):

    """ The href of an anchor tag, as a token.  Unlike ordinary words
    it is flagged to be hidden when it has not changed. """

    hide_when_equal = True

    def html(self):
        """Return the href rendered as a ' Link: ...' text snippet."""
        return ' Link: %s' % (self,)
511
def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    ``html`` may be an lxml element, which is used as-is, or a string,
    which is cleaned up and parsed first.  Only the content of a page is
    parsed: anything in the head is ignored, and the <head> and <body>
    elements themselves are optional.  Parsing is done by lxml, which
    ensures the validity of the resulting document (though it may guess
    wrong when the markup is particularly bad).

    For string input, <ins> and <del> tags are also removed from the
    document, as they would be confusing.

    If include_hrefs is true, the href attribute of <a> tags is
    included as a special kind of diffable token."""
    root = html if etree.iselement(html) else parse_html(html, cleanup=True)
    # Flatten the tree into chunks (start tags, words, end tags) ...
    chunks = flatten_el(root, skip_tag=True, include_hrefs=include_hrefs)
    # ... and regroup those chunks into token objects.
    return fixup_chunks(chunks)
535
def parse_html(html, cleanup=True):
    """
    Parse an HTML fragment and return an lxml element.  The fragment is
    parsed with ``create_parent=True``, so the returned element is a
    wrapper tag that was not part of the original document.

    If cleanup is true, the fragment is first passed through
    cleanup_html(), which drops everything outside <body> as well as
    any <ins> and <del> tags.
    """
    source = cleanup_html(html) if cleanup else html
    return fragment_fromstring(source, create_parent=True)
548 549 _body_re = re.compile(r'<body.*?>', re.I|re.S) 550 _end_body_re = re.compile(r'</body.*?>', re.I|re.S) 551 _ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S) 552
553 -def cleanup_html(html):
554 """ This 'cleans' the HTML, meaning that any page structure is removed 555 (only the contents of <body> are used, if there is any <body). 556 Also <ins> and <del> tags are removed. """ 557 match = _body_re.search(html) 558 if match: 559 html = html[match.end():] 560 match = _end_body_re.search(html) 561 if match: 562 html = html[:match.start()] 563 html = _ins_del_re.sub('', html) 564 return html


# Matches when a string ends with a whitespace character (space, tab,
# newline or carriage return); split_words() uses it to decide whether the
# last word keeps a trailing space.
end_whitespace_re = re.compile(r'[ \t\n\r]$')

def fixup_chunks(chunks):
    """
    Turn a sequence of chunks into a list of tokens.

    Chunks are either strings (words, start tags, end tags) or tuples
    ('img', src, tag) / ('href', href).  Words, images and hrefs become
    tokens; tags are attached to the neighbouring token as pre_tags or
    post_tags.  At least one token is always returned.
    """
    pending_tags = []   # tags seen since the last word-like token
    last_word = None
    tokens = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            kind = chunk[0]
            if kind == 'img':
                markup = chunk[2]
                spaced = markup.endswith(' ')
                if spaced:
                    markup = markup[:-1]
                last_word = tag_token('img', chunk[1], html_repr=markup,
                                      pre_tags=pending_tags,
                                      trailing_whitespace=spaced)
            elif kind == 'href':
                last_word = href_token(chunk[1], pre_tags=pending_tags,
                                       trailing_whitespace=True)
            else:
                # Unknown tuple chunks are skipped.
                continue
            pending_tags = []
            tokens.append(last_word)
        elif is_word(chunk):
            spaced = chunk.endswith(' ')
            if spaced:
                chunk = chunk[:-1]
            last_word = token(chunk, pre_tags=pending_tags,
                              trailing_whitespace=spaced)
            pending_tags = []
            tokens.append(last_word)
        elif is_start_tag(chunk):
            pending_tags.append(chunk)
        elif is_end_tag(chunk):
            if pending_tags:
                # Start tags are waiting for the next word; keep the end
                # tag with them so the order is preserved.
                pending_tags.append(chunk)
            else:
                assert last_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (last_word, tokens, chunk, chunks))
                last_word.post_tags.append(chunk)
        else:
            assert 0

    if not tokens:
        # No word at all: carry the tags on a single empty token.
        return [token('', pre_tags=pending_tags)]
    # Leftover tags follow the last word.
    tokens[-1].post_tags.extend(pending_tags)
    return tokens


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

# Tag names treated as block-level by _contains_block_level_tag():
block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

# Further tag names that _contains_block_level_tag() also counts as
# block-level (list items, table parts, ...):
block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Generate all the text chunks for the lxml element el.  Each
    start tag is a chunk, each word is a chunk, and each end tag is a
    chunk.  <img> start tags are yielded as ('img', src, tag) tuples
    and, when include_hrefs is true, the href of an <a> element is
    yielded as an ('href', href) tuple just before its end tag.

    If skip_tag is true, the outermost container tag is not returned
    (just its contents), and neither is the text following it."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        # An empty tag with nothing in or after it has no end tag chunk.
        return
    # cgi.escape() was removed in Python 3.8; html_escape is the
    # module-level alias for html.escape (cgi.escape on Python 2).  Quotes
    # are left alone because this is element text, not an attribute value.
    for word in split_words(el.text):
        yield html_escape(word, quote=False)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        for word in split_words(el.tail):
            yield html_escape(word, quote=False)
700
def split_words(text):
    """ Split text into a list of words.  Every word but the last gets
    one trailing space; the last word gets one only if text itself
    ends in whitespace.  Returns [] for empty, None or blank text. """
    words = text.split() if text else []
    if not words:
        return []
    spaced = [word + ' ' for word in words[:-1]]
    last = words[-1]
    if end_whitespace_re.search(text):
        last += ' '
    return spaced + [last]

# Matches when a string starts with a whitespace character (space, tab,
# newline or carriage return); end_tag() uses it to decide whether an end
# tag is followed by a space.
start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    Return the text representation of the start tag for element el:
    the tag name followed by every attribute as name="value", with the
    values escaped (including quotes).
    """
    # cgi.escape() was removed in Python 3.8; html_escape is the
    # module-level alias for html.escape (cgi.escape on Python 2).
    attributes = ''.join(
        [' %s="%s"' % (name, html_escape(value, True))
         for name, value in el.attrib.items()])
    return '<%s%s>' % (el.tag, attributes)
720
def end_tag(el):
    """ Return the text representation of the end tag for element el,
    followed by a single space when the element's tail starts with
    whitespace. """
    tail = el.tail
    spacer = ' ' if tail and start_whitespace_re.search(tail) else ''
    return '</%s>%s' % (el.tag, spacer)
729
def is_word(tok):
    """Return True if the chunk is a word, i.e. does not start with '<'."""
    return tok[:1] != '<'
732
def is_end_tag(tok):
    """Return True if the chunk is an end tag, i.e. starts with '</'."""
    return tok[:2] == '</'
735
def is_start_tag(tok):
    """Return True if the chunk starts with '<' but not with '</'."""
    return tok[:1] == '<' and tok[1:2] != '/'
738
def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of
    any block-level elements, e.g. transform <ins><p>word</p></ins>
    into <p><ins>word</ins></p>.  Returns the rewritten html string. """
    tree = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(tree)
    # parse_html wrapped the fragment in a parent tag; leave it out again.
    return serialize_html_fragment(tree, skip_outer=True)
747
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML text.  The serialized
    form includes the element's tail.

    If skip_outer is true, the outermost start and end tag are cut off
    and the remaining text is returned stripped of surrounding
    whitespace.
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    markup = etree.tostring(el, method="html", encoding=_unicode)
    if not skip_outer:
        return markup
    # Drop everything up to the end of the first tag ...
    markup = markup[markup.find('>') + 1:]
    # ... and everything from the start of the last tag.
    markup = markup[:markup.rfind('<')]
    return markup.strip()
765
def _fixup_ins_del_tags(doc):
    """In-place version of fixup_ins_del_tags that works on an lxml document.

    Every <ins>/<del> element that contains block-level content has its
    marker pushed down inside the blocks and is then dropped itself.
    """
    for tag_name in ('ins', 'del'):
        matches = doc.xpath('descendant-or-self::%s' % tag_name)
        for marker in matches:
            if _contains_block_level_tag(marker):
                _move_el_inside_block(marker, tag=tag_name)
                # drop_tag() is used here; an earlier call to
                # _merge_element_contents(marker) was left commented out.
                marker.drop_tag()

def _contains_block_level_tag(el):
    """True if el itself, or any of its descendants, is a block-level
    element like <p>, <td>, etc.
    """
    return (el.tag in block_level_tags
            or el.tag in block_level_container_tags
            or any(_contains_block_level_tag(child) for child in el))
786
def _move_el_inside_block(el, tag):
    """ Helper for _fixup_ins_del_tags; pushes a <tag> wrapper (``tag``
    is 'ins' or 'del') down inside el, below any block-level tags.
    el is modified in place. """
    if not any(_contains_block_level_tag(child) for child in el):
        # No block-level tags in any child: wrap the whole content of el
        # (its text and all children) in a single new <tag> element.
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    # Iterate over a copy, since children are inserted/replaced below.
    for child in list(el):
        if _contains_block_level_tag(child):
            # Recurse so the wrapper ends up inside the block content.
            _move_el_inside_block(child, tag)
            if child.tail:
                # Text following the block gets its own wrapper element.
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            # A child without block content is wrapped as a whole.
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        # Leading text of el gets its own wrapper element, too.
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)
819
def _merge_element_contents(el):
    """
    Remove the element el but keep its contents in its place, e.g.,
    given <p>Hi <i>there!</i></p>, removing the <i> element gives
    <p>Hi there!</p>
    """
    parent = el.getparent()
    leading = el.text or ''
    if el.tail:
        if len(el):
            # The tail follows the last child once el is gone.
            last_child = el[-1]
            last_child.tail = (last_child.tail or '') + el.tail
        else:
            # No children: text and tail simply run together.
            leading += el.tail
    position = parent.index(el)
    if leading:
        if position == 0:
            # el was the first child: its text joins the parent's text.
            parent.text = (parent.text or '') + leading
        else:
            # Otherwise it joins the tail of the preceding sibling.
            sibling = parent[position - 1]
            sibling.tail = (sibling.tail or '') + leading
    # Replace el by its children.
    parent[position:position + 1] = el.getchildren()
853
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes.

    Matching blocks whose size does not exceed a threshold are dropped.
    The threshold is the class attribute ``threshold``, capped at a
    quarter of the length of the shorter sequence so that short inputs
    can still match.
    """

    threshold = 2

    def get_matching_blocks(self):
        """Return the parent's matching blocks minus the too-small ones.

        Zero-size blocks (such as the terminating sentinel) are always kept.
        """
        # Use the shorter of BOTH sequences; the original compared
        # len(self.b) with itself and so ignored self.a.
        shorter = min(len(self.a), len(self.b))
        threshold = min(self.threshold, shorter / 4)
        blocks = difflib.SequenceMatcher.get_matching_blocks(self)
        return [block for block in blocks
                if block[2] > threshold
                or not block[2]]


# When the module is run as a script, hand over to the main() function of
# lxml.html._diffcommand (imported lazily so normal imports don't load it).
if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()
