
Source Code for Module lxml.html.diff

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()
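
# The markup hook above can be swapped for any callable with the same
# (text, version) signature.  A minimal illustrative sketch (not part of
# this module; the name markup_with_class is hypothetical):
#
#     >>> def markup_with_class(text, version):
#     ...     return '<span class="diff" title="%s">%s</span>' % (
#     ...         html_escape(_unicode(version), 1), text)
#     >>> print(html_annotate([('Hello World', 'v1'), ('Goodbye World', 'v2')],
#     ...                     markup=markup_with_class))
#     <span class="diff" title="v2">Goodbye</span> <span class="diff" title="v1">World</span>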

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
            not tok.pre_tags and
            result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result
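
# A minimal sketch of the compression step above, assuming Python 3 reprs;
# two adjacent tokens that share an annotation and have no intervening
# markup collapse into one:
#
#     >>> toks = tokenize_annotated('Hello World', 'version 1')
#     >>> [t.annotation for t in toks]
#     ['version 1', 'version 1']
#     >>> compress_tokens(toks)
#     [token('Hello World', [], [], '')]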

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
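
# A minimal usage sketch of htmldiff(); the input strings are illustrative,
# and the exact whitespace around the inserted tags can vary slightly
# between lxml versions:
#
#     >>> print(htmldiff('<p>Hello World</p>', '<p>Hello Everyone</p>'))
#     <p>Hello <ins>Everyone</ins> <del>World</del> </p>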

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end with
        # a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with markers to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into the preceding <div> tag, if the del looks like
    (DEL_START, 'Text</div>', DEL_END)).
    """
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end
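
# Two minimal illustrations of the splitting above (chunk lists chosen for
# illustration only):
#
#     >>> split_unbalanced(['<p>', 'Hello', '</p>', '</div>'])
#     ([], ['<p>', 'Hello', '</p>'], ['</div>'])
#     >>> split_unbalanced(['<div>', '<b>', 'Hello', '</b>'])
#     (['<div>'], ['<b>', 'Hello', '</b>'], [])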

def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)
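
# A minimal sketch of how a token carries its surrounding markup (the
# values here are illustrative only):
#
#     >>> tok = token('Hello', pre_tags=['<b>'], post_tags=['</b>'],
#     ...             trailing_whitespace=' ')
#     >>> tok == 'Hello'        # comparisons only see the text
#     True
#     >>> tok.pre_tags, tok.post_tags, tok.trailing_whitespace
#     (['<b>'], ['</b>'], ' ')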

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parses the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)
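
# A minimal sketch of the tokenization, assuming Python 3 reprs; each word
# becomes a token that carries the adjacent markup:
#
#     >>> tokenize('<p>Hello <b>World</b></p>')
#     [token('Hello', ['<p>'], [], ' '), token('World', ['<b>'], ['</b>', '</p>'], '')]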

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html
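
# A minimal illustration of the cleanup (input string chosen for
# illustration only):
#
#     >>> cleanup_html('<html><body><p>Hi <ins>there</ins></p></body></html>')
#     '<p>Hi there</p>'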


end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html
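
# A minimal sketch of the transformation described in the docstring above:
#
#     >>> print(fixup_ins_del_tags('<ins><p>Hello</p></ins>'))
#     <p><ins>Hello</ins></p>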

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
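
# A minimal sketch of the effect, with illustrative token lists: a single
# shared token amid otherwise different sequences falls under the threshold
# and is ignored, so one large replace is reported where a plain
# SequenceMatcher would report two small replaces around an equal block:
#
#     >>> s = InsensitiveSequenceMatcher(a=['a', 'b', 'x', 'c'],
#     ...                                b=['1', '2', 'x', '3'])
#     >>> s.get_opcodes()
#     [('replace', 0, 4, 0, 4)]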

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()