
Source Code for Module lxml.html.diff

from __future__ import absolute_import

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
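
# A minimal usage sketch (not part of the original module; output shown
# approximately -- exact whitespace may differ):
#
#   >>> htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>')
#   '<p><ins>Goodbye</ins> <del>Hello</del> World</p>'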

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end with
        # a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with markers to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))"""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end

def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found.  """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags that immediately precede the word are attached
    to this token as pre_tags, and closing tags that follow the word as
    post_tags.  Some exceptions occur when there are empty tags adjacent
    to a word, so there may be close tags in pre_tags, or open tags in
    post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, though that whitespace is not part of the token itself,
    so a word still compares equal to a similar word that does not have
    a trailing space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)
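
# A quick sketch of the token data structure (not part of the original module).
# The token *is* the word (a str subclass); surrounding markup and trailing
# whitespace ride along as attributes and are ignored in comparisons:
#
#   >>> t = token('Hello', pre_tags=['<b>'], post_tags=['</b>'], trailing_whitespace=' ')
#   >>> t == 'Hello'
#   True
#   >>> t.pre_tags, t.post_tags, t.trailing_whitespace
#   (['<b>'], ['</b>'], ' ')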

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        # The token text identifies the tag and its data, e.g. "img: <src>":
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, pre_tags=%r, post_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html

end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]

def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result

# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )

def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        # Drop matching blocks below the threshold, but keep the
        # zero-length terminating block that SequenceMatcher always adds:
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
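
# A quick illustration of the effect (not part of the original module): the
# stock SequenceMatcher would report the lone matching 'b' below as an equal
# block; the insensitive variant drops such tiny matches, so the whole span
# reads as one replacement:
#
#   >>> sm = InsensitiveSequenceMatcher(a=['a', 'b', 'c', 'd', 'e'],
#   ...                                 b=['x', 'b', 'y', 'z', 'w'])
#   >>> [block for block in sm.get_matching_blocks() if block[2]]
#   []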

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()