
Source Code for Module lxml.html.diff

# cython: language_level=3

from __future__ import absolute_import

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

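# Illustration: supplying a custom ``markup`` callable.  The helper below is
# only a sketch written for this example (it is not defined elsewhere in the
# module); it wraps each span in a class attribute instead of a title, and is
# called as markup(text, version) just like default_markup:
#
#     >>> def markup_by_class(text, version):
#     ...     return '<span class="%s">%s</span>' % (
#     ...         html_escape(_unicode(version), 1), text)
#     >>> print(html_annotate([('Hello World', 'v1'), ('Goodbye World', 'v2')],
#     ...                     markup=markup_by_class))
#     <span class="v2">Goodbye</span> <span class="v1">World</span>
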
def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

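# Illustration (a small sketch, not taken from the module's own doctests):
# two adjacent tokens that carry the same annotation and have no markup
# between them are merged into one token, with the first token's trailing
# whitespace folded into the merged text:
#
#     >>> a = token('Hello', trailing_whitespace=' '); a.annotation = 'v1'
#     >>> b = token('World'); b.annotation = 'v1'
#     >>> compress_tokens([a, b])
#     [token('Hello World', [], [], '')]
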
def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)

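# Illustration: a minimal diff of two plain-text fragments.  The expected
# output below follows from the token-level diff described above; the exact
# whitespace around the tags is an implementation detail:
#
#     >>> print(htmldiff('Hello World', 'Goodbye World'))
#     <ins>Goodbye</ins> <del>Hello</del> World
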
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end with
        # a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))."""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end

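# Illustration: an unclosed <p> ends up in unbalanced_start, a stray </div>
# ends up in unbalanced_end, and the plain word stays in the balanced middle:
#
#     >>> split_unbalanced(['<p>', 'Hello', '</div>'])
#     (['<p>'], ['Hello'], ['</div>'])
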
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

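# Illustration: the chunks between the first DEL_START/DEL_END pair are
# returned as the middle element of the tuple:
#
#     >>> split_delete(['a', DEL_START, 'b', DEL_END, 'c'])
#     (['a'], ['b'], ['c'])
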
def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags that are adjacent to the word are attached to
    this token (pre_tags), as are closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace; the whitespace is stored outside the token's text, so
    the word still compares equal to the same word without a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)
    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)

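# Illustration: the surrounding markup is attached to the word tokens rather
# than becoming tokens of its own, so only the words take part in the diff:
#
#     >>> tokenize('<p>Hello World</p>')
#     [token('Hello', ['<p>'], [], ' '), token('World', [], ['</p>'], '')]
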
def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

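# Illustration: the returned element is the synthetic <div> wrapper mentioned
# in the docstring, with the fragment's own elements as its children:
#
#     >>> el = parse_html('<p>Hi there</p>')
#     >>> el.tag
#     'div'
#     >>> [child.tag for child in el]
#     ['p']
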

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html

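# Illustration: page structure outside <body> is stripped and <ins>/<del>
# tags are removed, leaving only the body contents:
#
#     >>> cleanup_html('<html><body><p>Hi <ins>there</ins></p></body></html>')
#     '<p>Hi there</p>'
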

end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words
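
# Illustration: each word keeps the whitespace that follows it, so the words
# can later be re-joined without losing spacing:
#
#     >>> split_words('Some text here ')
#     ['Some ', 'text ', 'here ']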

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

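# Illustration (following the transformation named in the docstring above):
# an <ins> wrapped around a block-level element is pushed inside it:
#
#     >>> print(fixup_ins_del_tags('<ins><p>Some text</p></ins>'))
#     <p><ins>Some text</ins></p>
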
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()

class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()