Package lxml :: Module doctestcompare
[hide private]
[frames | no frames]

Source Code for Module lxml.doctestcompare

  1  """ 
  2  lxml-based doctest output comparison. 
  3   
  4  Note: normally, you should just import the `lxml.usedoctest` and 
  5  `lxml.html.usedoctest` modules from within a doctest, instead of this 
  6  one:: 
  7   
  8      >>> import lxml.usedoctest # for XML output 
  9   
 10      >>> import lxml.html.usedoctest # for HTML output 
 11   
 12  To use this module directly, you must call ``lxml.doctestcompare.install()``, 
 13  which will cause doctest to use this in all subsequent calls. 
 14   
 15  This changes the way output is checked and comparisons are made for 
 16  XML or HTML-like content. 
 17   
 18  XML or HTML content is noticed because the example starts with ``<`` 
 19  (it's HTML if it starts with ``<html``).  You can also use the 
 20  ``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing. 
 21   
 22  Some rough wildcard-like things are allowed.  Whitespace is generally 
 23  ignored (except in attributes).  In text (attributes and text in the 
 24  body) you can use ``...`` as a wildcard.  In an example it also 
 25  matches any trailing tags in the element, though it does not match 
 26  leading tags.  You may create a tag ``<any>`` or include an ``any`` 
 27  attribute in the tag.  An ``any`` tag matches any tag, while the 
 28  attribute matches any and all attributes. 
 29   
 30  When a match fails, the reformatted example and the actual output are 
 31  displayed (indented), and a rough diff-like output is given.  Anything 
 32  marked with ``+`` is in the output but wasn't supposed to be, and 
 33  similarly ``-`` means it's in the example but wasn't in the output. 
 34   
 35  You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP`` 
 36  """ 
 37   
 38  from lxml import etree 
 39  import sys 
 40  import re 
 41  import doctest 
 42  try: 
 43      from html import escape as html_escape 
 44  except ImportError: 
 45      from cgi import escape as html_escape 
 46   
 47  __all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker', 
 48             'LHTMLOutputChecker', 'install', 'temp_install'] 
 49   
 50  try: 
 51      _basestring = basestring 
 52  except NameError: 
 53      _basestring = (str, bytes) 
 54   
 55  _IS_PYTHON_3 = sys.version_info[0] >= 3 
 56   
 57  PARSE_HTML = doctest.register_optionflag('PARSE_HTML') 
 58  PARSE_XML = doctest.register_optionflag('PARSE_XML') 
 59  NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP') 
 60   
 61  OutputChecker = doctest.OutputChecker 
 62   
def strip(v):
    """Return ``v.strip()``, passing ``None`` through unchanged."""
    return None if v is None else v.strip()
68
def norm_whitespace(v):
    """Replace every match of ``_norm_whitespace_re`` in ``v`` with one space.

    Leading and trailing whitespace is not removed here.
    """
    collapsed = _norm_whitespace_re.sub(' ', v)
    return collapsed
# Parser shared by every HTML comparison: error recovery is switched off and
# blank text is removed while parsing.
_html_parser = etree.HTMLParser(remove_blank_text=True, recover=False)


def html_fromstring(html):
    """Parse ``html`` using the module-level ``_html_parser``.

    Returns whatever ``etree.fromstring`` returns for that parser.
    """
    return etree.fromstring(html, parser=_html_parser)
# Matches a run of two or more spaces, tabs or newlines (a single
# whitespace character is left alone).
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')

# Used to tell an object repr() such as ``<Foo object at 0x...>`` apart from
# real markup that merely starts with ``<``:
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
class LXMLOutputChecker(OutputChecker):
    """Doctest output checker that compares XML/HTML output structurally.

    When both the expected and the actual output look like markup, they are
    parsed and compared as trees instead of as strings.  ``...`` acts as a
    wildcard in text and attribute values, an ``any`` tag matches any tag,
    and an ``any`` attribute matches any set of attributes.  Everything else
    is delegated to the stock ``doctest.OutputChecker``.
    """

    # HTML elements that never have content or an end tag.
    empty_tags = (
        'param', 'img', 'area', 'br', 'basefont', 'input',
        'base', 'meta', 'link', 'col')

    def get_default_parser(self):
        """Return the parser used when markup is detected without a flag."""
        return etree.XML

    def check_output(self, want, got, optionflags):
        """Return True if ``got`` matches ``want``.

        NOTE: temp_install() transplants this method's code object into
        doctest's own ``check_output`` function, where it runs with
        doctest's globals.  The body must therefore only use global names
        that exist there (``OutputChecker``, plus ``etree`` which
        temp_install() injects) and must not use closures.  Keep it as is.
        """
        # When running as transplanted code, ``self`` is doctest's original
        # checker; it carries the real checker and a way to call the
        # original implementation.
        alt_self = getattr(self, '_temp_override_self', None)
        if alt_self is not None:
            super_method = self._temp_call_super_check_output
            self = alt_self
        else:
            super_method = OutputChecker.check_output
        parser = self.get_parser(want, got, optionflags)
        if not parser:
            return super_method(
                self, want, got, optionflags)
        try:
            want_doc = parser(want)
        except etree.XMLSyntaxError:
            return False
        try:
            got_doc = parser(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(want_doc, got_doc)

    def get_parser(self, want, got, optionflags):
        """Pick the parser for this example, or None for plain text.

        Explicit option flags win; otherwise the content is sniffed: output
        starting with ``<html`` (in any letter case) is HTML, other markup
        uses the default parser.
        """
        if NOPARSE_MARKUP & optionflags:
            return None
        if PARSE_HTML & optionflags:
            return html_fromstring
        if PARSE_XML & optionflags:
            return etree.XML
        # Both sides are lower-cased so that ``<HTML>`` in the actual output
        # is recognised just like it is in the example.
        if (want.strip().lower().startswith('<html')
                and got.strip().lower().startswith('<html')):
            return html_fromstring
        if (self._looks_like_markup(want)
                and self._looks_like_markup(got)):
            return self.get_default_parser()
        return None

    def _looks_like_markup(self, s):
        """Return True if ``s`` starts with ``<`` and is not an object repr."""
        s = s.strip()
        return (s.startswith('<')
                and not _repr_re.search(s))

    def compare_docs(self, want, got):
        """Recursively compare the element ``want`` against ``got``.

        Tag, text, tail and attributes must match (subject to the wildcard
        rules), then the children are compared pairwise.
        """
        if not self.tag_compare(want.tag, got.tag):
            return False
        if not self.text_compare(want.text, got.text, True):
            return False
        if not self.text_compare(want.tail, got.tail, True):
            return False
        if 'any' not in want.attrib:
            want_keys = sorted(want.attrib.keys())
            got_keys = sorted(got.attrib.keys())
            if want_keys != got_keys:
                return False
            for key in want_keys:
                if not self.text_compare(want.attrib[key], got.attrib[key], False):
                    return False
        # ``<tag>...</tag>`` (text of '...' and no children) matches any
        # content, so the children are only compared otherwise.
        if want.text != '...' or len(want):
            want_children = list(want)
            got_children = list(got)
            while want_children or got_children:
                if not want_children or not got_children:
                    return False
                want_first = want_children.pop(0)
                got_first = got_children.pop(0)
                if not self.compare_docs(want_first, got_first):
                    return False
                # A '...' tail on the example's child matches whatever
                # follows.  This must apply when the example has no further
                # children (trailing tags in the output are accepted, as the
                # module documentation promises); the historical case of the
                # output running out first is kept for compatibility.
                if want_first.tail == '...' and not (want_children and got_children):
                    break
        return True

    def text_compare(self, want, got, strip):
        """Return True if the text ``got`` matches the pattern ``want``.

        ``None`` is treated as the empty string.  If ``strip`` is true, both
        values are whitespace-normalised and stripped first.  ``...`` in
        ``want`` matches any run of characters, including newlines.
        """
        want = want or ''
        got = got or ''
        if strip:
            want = norm_whitespace(want).strip()
            got = norm_whitespace(got).strip()
        pattern = '^%s$' % re.escape(want)
        pattern = pattern.replace(r'\.\.\.', '.*')
        # DOTALL: without it the wildcard would stop at the first newline and
        # multi-line text could never match ``...``.
        return re.search(pattern, got, re.DOTALL) is not None

    def tag_compare(self, want, got):
        """Return True if the tag ``got`` satisfies the expected tag ``want``.

        ``any`` matches everything, and a ``{...}`` namespace prefix on
        ``want`` matches any namespace.  Non-string tags are compared with
        ``==``.
        """
        if want == 'any':
            return True
        if (not isinstance(want, _basestring)
                or not isinstance(got, _basestring)):
            return want == got
        want = want or ''
        got = got or ''
        if want.startswith('{...}'):
            # Ellipsis on the namespace: only the local names must agree.
            return want.split('}')[-1] == got.split('}')[-1]
        return want == got

    def output_difference(self, example, got, optionflags):
        """Return a description of how ``got`` differs from ``example.want``.

        Markup that parses on both sides is shown reformatted, followed by a
        rough diff.  Otherwise the standard doctest message is returned,
        prefixed by any parse errors.
        """
        want = example.want
        parser = self.get_parser(want, got, optionflags)
        errors = []
        if parser is not None:
            try:
                want_doc = parser(want)
            except etree.XMLSyntaxError:
                e = sys.exc_info()[1]
                errors.append('In example: %s' % e)
            try:
                got_doc = parser(got)
            except etree.XMLSyntaxError:
                e = sys.exc_info()[1]
                errors.append('In actual output: %s' % e)
        if parser is None or errors:
            value = OutputChecker.output_difference(
                self, example, got, optionflags)
            if errors:
                errors.append(value)
                return '\n'.join(errors)
            return value
        html = parser is html_fromstring
        diff_parts = [
            'Expected:',
            self.format_doc(want_doc, html, 2),
            'Got:',
            self.format_doc(got_doc, html, 2),
            'Diff:',
            self.collect_diff(want_doc, got_doc, html, 2),
        ]
        return '\n'.join(diff_parts)

    def html_empty_tag(self, el, html=True):
        """Return True if ``el`` is a content-less HTML element such as ``<br>``."""
        if not html:
            return False
        if el.tag not in self.empty_tags:
            return False
        if el.text or len(el):
            # This shouldn't happen (contents in an empty tag)
            return False
        return True

    def format_doc(self, doc, html, indent, prefix=''):
        """Return ``doc`` pretty-printed with ``indent`` leading spaces.

        ``prefix`` is written just before the opening tag of ``doc`` itself
        (it is not passed on to children).
        """
        parts = []
        if not len(doc):
            # No children: everything goes on one line.
            parts.append(' '*indent)
            parts.append(prefix)
            parts.append(self.format_tag(doc))
            if not self.html_empty_tag(doc, html):
                if strip(doc.text):
                    parts.append(self.format_text(doc.text))
                parts.append(self.format_end_tag(doc))
            if strip(doc.tail):
                parts.append(self.format_text(doc.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(prefix)
        parts.append(self.format_tag(doc))
        if not self.html_empty_tag(doc, html):
            parts.append('\n')
            if strip(doc.text):
                parts.append(' '*indent)
                parts.append(self.format_text(doc.text))
                parts.append('\n')
            for el in doc:
                parts.append(self.format_doc(el, html, indent+2))
            parts.append(' '*indent)
            parts.append(self.format_end_tag(doc))
            parts.append('\n')
        if strip(doc.tail):
            parts.append(' '*indent)
            parts.append(self.format_text(doc.tail))
            parts.append('\n')
        return ''.join(parts)

    def format_text(self, text, strip=True):
        """Return ``text`` HTML-escaped (quotes included); '' for ``None``."""
        if text is None:
            return ''
        if strip:
            text = text.strip()
        return html_escape(text, 1)

    def format_tag(self, el):
        """Return the opening tag of ``el`` with its attributes sorted by name."""
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '<!--'
        attrs = ['%s="%s"' % (name, self.format_text(value, False))
                 for name, value in sorted(el.attrib.items())]
        if not attrs:
            return '<%s>' % el.tag
        return '<%s %s>' % (el.tag, ' '.join(attrs))

    def format_end_tag(self, el):
        """Return the closing tag of ``el`` (``-->`` for comments)."""
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '-->'
        return '</%s>' % el.tag

    def collect_diff(self, want, got, html, indent):
        """Return a rough, indented diff of the trees ``want`` and ``got``.

        Children are paired up by position; surplus children of ``got`` are
        prefixed with ``+`` and surplus children of ``want`` with ``-``.
        """
        parts = []
        if not len(want) and not len(got):
            # Two leaf elements: diff them on a single line.
            parts.append(' '*indent)
            parts.append(self.collect_diff_tag(want, got))
            if not self.html_empty_tag(got, html):
                parts.append(self.collect_diff_text(want.text, got.text))
                parts.append(self.collect_diff_end_tag(want, got))
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(self.collect_diff_tag(want, got))
        parts.append('\n')
        if strip(want.text) or strip(got.text):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.text, got.text))
            parts.append('\n')
        want_children = list(want)
        got_children = list(got)
        while want_children or got_children:
            if not want_children:
                parts.append(self.format_doc(got_children.pop(0), html, indent+2, '+'))
                continue
            if not got_children:
                parts.append(self.format_doc(want_children.pop(0), html, indent+2, '-'))
                continue
            parts.append(self.collect_diff(
                want_children.pop(0), got_children.pop(0), html, indent+2))
        parts.append(' '*indent)
        parts.append(self.collect_diff_end_tag(want, got))
        parts.append('\n')
        if strip(want.tail) or strip(got.tail):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
        return ''.join(parts)

    def collect_diff_tag(self, want, got):
        """Return the opening tag for the diff, marking tag/attribute changes."""
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        attrs = []
        # True when the example accepts any attributes on this element.
        match_any = want.tag == 'any' or 'any' in want.attrib
        for name, value in sorted(got.attrib.items()):
            if name not in want.attrib and not match_any:
                attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
            else:
                if name in want.attrib:
                    text = self.collect_diff_text(want.attrib[name], value, False)
                else:
                    text = self.format_text(value, False)
                attrs.append('%s="%s"' % (name, text))
        if not match_any:
            for name, value in sorted(want.attrib.items()):
                if name in got.attrib:
                    continue
                attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
        if attrs:
            tag = '<%s %s>' % (tag, ' '.join(attrs))
        else:
            tag = '<%s>' % tag
        return tag

    def collect_diff_end_tag(self, want, got):
        """Return the closing tag for the diff, marking a tag mismatch.

        Uses the same matching rules as the opening tag (``tag_compare``), so
        an ``any`` tag or a ``{...}`` namespace wildcard is not reported as a
        difference on the end tag alone.
        """
        if want.tag == got.tag or self.tag_compare(want.tag, got.tag):
            tag = got.tag
        else:
            tag = '%s (got: %s)' % (want.tag, got.tag)
        return '</%s>' % tag

    def collect_diff_text(self, want, got, strip=True):
        """Return ``got`` escaped, or ``want (got: got)`` if they do not match."""
        if self.text_compare(want, got, strip):
            if not got:
                return ''
            return self.format_text(got, strip)
        text = '%s (got: %s)' % (want, got)
        return self.format_text(text, strip)
369
class LHTMLOutputChecker(LXMLOutputChecker):
    """Variant of ``LXMLOutputChecker`` whose default parser is the HTML one."""

    def get_default_parser(self):
        """Return ``html_fromstring`` as the parser for unflagged markup."""
        return html_fromstring
373
def install(html=False):
    """
    Make doctest use this module's checker from now on.

    ``doctest.OutputChecker`` is rebound to ``LHTMLOutputChecker`` when
    ``html`` is true (HTML parser by default) and to ``LXMLOutputChecker``
    otherwise (XML parser by default).
    """
    doctest.OutputChecker = LHTMLOutputChecker if html else LXMLOutputChecker
385
def temp_install(html=False, del_module=None):
    """
    Enable this checker for the doctest that is currently running only;
    call it from *inside* that doctest.

    With a true ``html`` the HTML parser is the default, otherwise the XML
    parser is.  ``del_module`` is handed on to ``_RestoreChecker``.
    """
    checker_class = LHTMLOutputChecker if html else LXMLOutputChecker
    frame = _find_doctest_frame()
    runner = frame.f_locals['self']
    new_checker = checker_class()
    previous_checker = runner._checker
    runner._checker = new_checker
    # Swapping ``_checker`` is not enough: the function running the doctests
    # holds a local variable 'check', a bound method of the output checker.
    # A frame's locals cannot be modified, so the function object behind that
    # method is changed in place by replacing its code.  The replacement is
    # undone once __record_outcome runs, which marks the end of the __run
    # method.
    bound_check = frame.f_locals['check']
    if _IS_PYTHON_3:
        check_func = bound_check.__func__
        checker_check_func = new_checker.check_output.__func__
    else:
        check_func = bound_check.im_func
        checker_check_func = new_checker.check_output.im_func
    # The function's globals cannot be patched, and ``etree`` is the only
    # global check_output needs that doctest lacks, so it is provided there:
    doctest.etree = etree
    _RestoreChecker(runner, previous_checker, new_checker,
                    check_func, checker_check_func,
                    del_module)
424
class _RestoreChecker(object):
    """Install the temporary checker hooks and undo them again later.

    Creating an instance swaps the code of ``check_func`` for that of
    ``clone_func`` and replaces the runner's ``__record_outcome`` with the
    instance itself.  Calling the instance reverts both, forwards to the
    original ``__record_outcome`` and optionally unloads ``del_module``.
    """

    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        self.dt_self = dt_self
        self.checker = old_checker
        # Attributes read by the transplanted check_output code.
        self.checker._temp_call_super_check_output = self.call_super
        self.checker._temp_override_self = new_checker
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        self.install_clone()
        self.install_dt_self()

    def install_clone(self):
        """Save the code/globals of ``check_func`` and install the clone's code."""
        target = self.check_func
        if _IS_PYTHON_3:
            self.func_code = target.__code__
            self.func_globals = target.__globals__
            target.__code__ = self.clone_func.__code__
            return
        self.func_code = target.func_code
        self.func_globals = target.func_globals
        target.func_code = self.clone_func.func_code

    def uninstall_clone(self):
        """Put the saved code object back on ``check_func``."""
        if _IS_PYTHON_3:
            self.check_func.__code__ = self.func_code
            return
        self.check_func.func_code = self.func_code

    def install_dt_self(self):
        """Remember the runner's ``__record_outcome`` and replace it with self."""
        runner = self.dt_self
        self.prev_func = runner._DocTestRunner__record_outcome
        runner._DocTestRunner__record_outcome = self

    def uninstall_dt_self(self):
        """Restore the runner's original ``__record_outcome``."""
        self.dt_self._DocTestRunner__record_outcome = self.prev_func

    def uninstall_module(self):
        """Drop ``del_module`` from ``sys.modules`` and from its parent package."""
        name = self.del_module
        if not name:
            return
        import sys
        del sys.modules[name]
        if '.' in name:
            package, module = name.rsplit('.', 1)
            package_mod = sys.modules[package]
            delattr(package_mod, module)

    def __call__(self, *args, **kw):
        """Undo all hooks, then forward the call to the original ``__record_outcome``."""
        self.uninstall_clone()
        self.uninstall_dt_self()
        del self.checker._temp_override_self
        del self.checker._temp_call_super_check_output
        outcome = self.prev_func(*args, **kw)
        self.uninstall_module()
        return outcome

    def call_super(self, *args, **kw):
        """Call the original ``check_func`` code, then reinstall the clone."""
        self.uninstall_clone()
        try:
            return self.check_func(*args, **kw)
        finally:
            self.install_clone()
478
def _find_doctest_frame():
    """Return the nearest calling frame that has a local named ``BOOM``.

    The search starts at the caller's frame and walks outwards; such a local
    is taken as the sign of the doctest frame.  Raises ``LookupError`` if no
    frame qualifies.
    """
    import sys
    current = sys._getframe(1)
    while current:
        if 'BOOM' in current.f_locals:
            # Sign of doctest
            return current
        current = current.f_back
    raise LookupError(
        "Could not find doctest (only use this function *inside* a doctest)")
# Self-test picked up by doctest.testmod().  The examples call print() with
# parentheses and a single argument so that they are valid on both Python 2
# and Python 3 (the module supports both).
__test__ = {
    'basic': '''
    >>> temp_install()
    >>> print("""<xml a="1" b="2">stuff</xml>""")
    <xml b="2" a="1">...</xml>
    >>> print("""<xml xmlns="http://example.com"><tag attr="bar" /></xml>""")
    <xml xmlns="...">
      <tag attr="..." />
    </xml>
    >>> print("""<xml>blahblahblah<foo /></xml>""") # doctest: +NOPARSE_MARKUP, +ELLIPSIS
    <xml>...foo /></xml>
    '''}

if __name__ == '__main__':
    import doctest
    doctest.testmod()