
Source Code for Module lxml.html.diff

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import cgi
import re

__all__ = ['html_annotate', 'htmldiff']


############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        cgi.escape(unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> html_annotate([(version1, 'version 1'),
        ...                (version2, 'version 2')])
        u'<span title="version 2">Goodbye</span> <span title="version 1">World</span>'

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents.

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> default_markup('Some Text', 'by Joe')
        u'<span title="by Joe">Some Text</span>'
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
            not tok.pre_tags and
            result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = unicode(last)
        if last.trailing_whitespace:
            text += ' '
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += ' '
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)

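# Illustrative usage of htmldiff (a sketch, not one of the module's own
# doctests; the exact placement of <ins>/<del> and the surrounding whitespace
# depend on how the diff falls, so the output shown is only approximate):
#
#   >>> htmldiff('<p>Some text here.</p>', '<p>Some more text here.</p>')
#   ... # -> roughly '<p>Some <ins>more</ins> text here.</p>'
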
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + ' '
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end with
        # a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)
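
# Illustrative call (a sketch): inserted chunks are wrapped in <ins>, with any
# trailing space moved outside the closing tag.
#
#   >>> doc = ['Hello ']
#   >>> merge_insert(['wonderful ', 'world '], doc)
#   >>> doc
#   ['Hello ', '<ins>', 'wonderful ', 'world', '</ins> ']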

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass

class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with markers to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))"""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end

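# Illustrative call (a sketch): the dangling <b> is reported as unbalanced at
# the start, the dangling </i> as unbalanced at the end.
#
#   >>> split_unbalanced(['<b>', 'Hello', '</i>'])
#   (['<b>'], ['Hello'], ['</i>'])
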
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

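# Illustrative call (a sketch):
#
#   >>> split_delete(['one ', DEL_START, 'two ', DEL_END, 'three'])
#   (['one '], ['two '], ['three'])
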
def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags that are adjacent to the word are attached to
    this token (pre_tags), as are closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, though that does not affect whether two tokens compare
    as equal."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
        obj = unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r)' % (unicode.__repr__(self), self.pre_tags, self.post_tags)

    def html(self):
        return unicode(self)

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=False):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return 'Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parses the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)

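# Illustrative call (a sketch; token reprs shown approximately): markup ends up
# attached to the neighbouring words as pre_tags/post_tags.
#
#   >>> tokenize('<p>Hello world</p>')
#   ... # -> roughly [token(u'Hello', ['<p>'], []), token(u'world', [], ['</p>'])]
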
def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html
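
# Illustrative call (a sketch): page structure outside <body> is dropped and
# any existing <ins>/<del> markup is stripped.
#
#   >>> cleanup_html('<html><body><p>Hi <ins>there</ins></p></body></html>')
#   '<p>Hi there</p>'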


end_whitespace_re = re.compile(r'[ \t\n\r]$')

def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag = chunk[2]
                if tag.endswith(' '):
                    tag = tag[:-1]
                    trailing_whitespace = True
                else:
                    trailing_whitespace = False
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)
            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
                tag_accum = []
                result.append(cur_word)
            continue
        if is_word(chunk):
            if chunk.endswith(' '):
                chunk = chunk[:-1]
                trailing_whitespace = True
            else:
                trailing_whitespace = False
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)
        elif is_start_tag(chunk):
            tag_accum.append(chunk)
        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert(0)

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )

def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.attrib['src'], start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield cgi.escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
        yield ('href', el.attrib['href'])
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield cgi.escape(word)

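# Illustrative call (a sketch): each start tag, word, and end tag becomes its
# own chunk; skip_tag=True omits the wrapping <div> that parse_html added.
#
#   >>> list(flatten_el(parse_html('<p>Hello world</p>'), include_hrefs=True,
#   ...                 skip_tag=True))
#   ... # -> roughly ['<p>', 'Hello ', 'world', '</p>']
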
def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace (one
    space) on each word when appropriate.  """
    if not text or not text.strip():
        return []
    words = [w + ' ' for w in text.strip().split()]
    if not end_whitespace_re.search(text):
        words[-1] = words[-1][:-1]
    return words
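
# Illustrative calls (a sketch): each word keeps one trailing space, except the
# last word when the original text had no trailing whitespace.
#
#   >>> split_words('Hello   brave world ')
#   ['Hello ', 'brave ', 'world ']
#   >>> split_words('Hello world')
#   ['Hello ', 'world']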

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, cgi.escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

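# Illustrative calls (a sketch), using an element from a parsed fragment;
# the attribute text comes straight from the element's attrib:
#
#   >>> div = parse_html('<a href="/x">hi</a> there')
#   >>> start_tag(div[0])
#   ... # -> '<a href="/x">'
#   >>> end_tag(div[0])
#   ... # -> '</a> '  (trailing space because the tail starts with whitespace)
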
def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

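# Illustrative call (a sketch, mirroring the docstring): the <ins> is pushed
# inside the block-level <p> so the result stays valid HTML.
#
#   >>> fixup_ins_del_tags('<ins><p>word</p></ins>')
#   ... # -> roughly '<p><ins>word</ins></p>'
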
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding="UTF-8")
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

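# Illustrative calls (a sketch): skip_outer drops the wrapping tag that
# parse_html added.
#
#   >>> el = parse_html('Hello <b>world</b>', cleanup=False)
#   >>> serialize_html_fragment(el)
#   ... # -> roughly '<div>Hello <b>world</b></div>'
#   >>> serialize_html_fragment(el, skip_outer=True)
#   ... # -> roughly 'Hello <b>world</b>'
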
def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]


if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()