Package lxml :: Module doctestcompare
[hide private]
[frames | no frames]

Source Code for Module lxml.doctestcompare

  1  """ 
  2  lxml-based doctest output comparison. 
  3   
  4  Note: normally, you should just import the `lxml.usedoctest` and 
  5  `lxml.html.usedoctest` modules from within a doctest, instead of this 
  6  one:: 
  7   
  8      >>> import lxml.usedoctest # for XML output 
  9   
 10      >>> import lxml.html.usedoctest # for HTML output 
 11   
 12  To use this module directly, you must call ``lxml.doctestcompare.install()``, 
 13  which will cause doctest to use this in all subsequent calls. 
 14   
 15  This changes the way output is checked and comparisons are made for 
 16  XML or HTML-like content. 
 17   
 18  XML or HTML content is noticed because the example starts with ``<`` 
 19  (it's HTML if it starts with ``<html``).  You can also use the 
 20  ``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing. 
 21   
 22  Some rough wildcard-like things are allowed.  Whitespace is generally 
 23  ignored (except in attributes).  In text (attributes and text in the 
 24  body) you can use ``...`` as a wildcard.  In an example it also 
 25  matches any trailing tags in the element, though it does not match 
 26  leading tags.  You may create a tag ``<any>`` or include an ``any`` 
 27  attribute in the tag.  An ``any`` tag matches any tag, while the 
 28  attribute matches any and all attributes. 
 29   
 30  When a match fails, the reformatted example and actual output are 
 31  displayed (indented), and a rough diff-like output is given.  Anything 
 32  marked with ``-`` is in the output but wasn't supposed to be, and 
 33  similarly ``+`` means it's in the example but wasn't in the output. 
 34   
 35  You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP`` 
 36  """ 
 37   
 38  from lxml import etree 
 39  import sys 
 40  import re 
 41  import doctest 
 42  import cgi 
 43   
 44  __all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker', 
 45             'LHTMLOutputChecker', 'install', 'temp_install'] 
 46   
 47  try: 
 48      _basestring = basestring 
 49  except NameError: 
 50      _basestring = (str, bytes) 
 51   
 52  _IS_PYTHON_3 = sys.version_info[0] >= 3 
 53   
 54  PARSE_HTML = doctest.register_optionflag('PARSE_HTML') 
 55  PARSE_XML = doctest.register_optionflag('PARSE_XML') 
 56  NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP') 
 57   
 58  OutputChecker = doctest.OutputChecker 
 59   
def strip(v):
    """Return ``v.strip()``, passing ``None`` through unchanged."""
    return None if v is None else v.strip()
65
def norm_whitespace(v):
    """Replace every match of ``_norm_whitespace_re`` in *v* with one space."""
    collapsed = _norm_whitespace_re.sub(' ', v)
    return collapsed
# Shared HTML parser used by html_fromstring(): created once with error
# recovery switched off and blank text removal switched on.
_html_parser = etree.HTMLParser(remove_blank_text=True, recover=False)
def html_fromstring(html):
    """Parse the string *html* with the module's shared ``_html_parser``."""
    return etree.fromstring(html, parser=_html_parser)
# Matches two or more consecutive space/tab/newline characters; used by
# norm_whitespace().
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')

# Matches text shaped like ``<Name at ...`` / ``<Name object ...``.  We use
# this to tell object repr()s apart from real markup elements.
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
class LXMLOutputChecker(OutputChecker):
    """Doctest output checker that compares markup as parsed trees.

    When both the expected and the actual output look like markup (or a
    ``PARSE_XML`` / ``PARSE_HTML`` flag forces it), both are parsed and the
    resulting element trees are compared.  Otherwise the check is delegated
    to the stock ``OutputChecker``.

    Wildcards understood in the expected document:

    * ``...`` inside text or attribute values matches any text;
    * a ``<any>`` tag matches any tag;
    * an ``any`` attribute disables attribute comparison for that element;
    * a ``{...}`` namespace prefix on a tag matches any namespace.
    """

    # HTML tags that are formatted without content or an end tag.
    empty_tags = (
        'param', 'img', 'area', 'br', 'basefont', 'input',
        'base', 'meta', 'link', 'col')

    def get_default_parser(self):
        """Return the parser used when markup is detected without a flag."""
        return etree.XML

    def check_output(self, want, got, optionflags):
        """Return True if *got* matches *want*, comparing markup as trees.

        NOTE: temp_install() transplants this function's code object onto
        doctest's own ``check_output``, where it runs with doctest's module
        globals.  The body must therefore only use builtins plus the globals
        ``OutputChecker`` and ``etree`` (temp_install() sets
        ``doctest.etree``), and must not use closures or zero-argument
        ``super()``.
        """
        alt_self = getattr(self, '_temp_override_self', None)
        if alt_self is not None:
            # Running as transplanted code on the original checker:
            # redirect to the temporary checker installed by temp_install().
            super_method = self._temp_call_super_check_output
            self = alt_self
        else:
            super_method = OutputChecker.check_output
        parser = self.get_parser(want, got, optionflags)
        if not parser:
            return super_method(
                self, want, got, optionflags)
        try:
            want_doc = parser(want)
        except etree.XMLSyntaxError:
            return False
        try:
            got_doc = parser(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(want_doc, got_doc)

    def get_parser(self, want, got, optionflags):
        """Pick the parser for this comparison, or None for a plain one.

        Precedence: ``NOPARSE_MARKUP`` (no parsing), ``PARSE_HTML``,
        ``PARSE_XML``, both sides starting with ``<html`` (HTML parser),
        both sides looking like markup (default parser).
        """
        parser = None
        if NOPARSE_MARKUP & optionflags:
            return None
        if PARSE_HTML & optionflags:
            parser = html_fromstring
        elif PARSE_XML & optionflags:
            parser = etree.XML
        elif (want.strip().lower().startswith('<html')
              and got.strip().lower().startswith('<html')):
            # Both sides are sniffed case-insensitively so that e.g.
            # ``<HTML>`` in the actual output is detected as well.
            parser = html_fromstring
        elif (self._looks_like_markup(want)
              and self._looks_like_markup(got)):
            parser = self.get_default_parser()
        return parser

    def _looks_like_markup(self, s):
        """Return True if *s* starts with ``<`` but is not an object repr."""
        s = s.strip()
        return (s.startswith('<')
                and not _repr_re.search(s))

    def compare_docs(self, want, got):
        """Recursively compare element *got* against the pattern *want*."""
        if not self.tag_compare(want.tag, got.tag):
            return False
        if not self.text_compare(want.text, got.text, True):
            return False
        if not self.text_compare(want.tail, got.tail, True):
            return False
        if 'any' not in want.attrib:
            want_keys = sorted(want.attrib.keys())
            got_keys = sorted(got.attrib.keys())
            if want_keys != got_keys:
                return False
            for key in want_keys:
                if not self.text_compare(want.attrib[key], got.attrib[key], False):
                    return False
        # A childless element whose text is exactly '...' matches any content.
        if want.text != '...' or len(want):
            want_children = list(want)
            got_children = list(got)
            while want_children or got_children:
                if not want_children or not got_children:
                    return False
                want_first = want_children.pop(0)
                got_first = got_children.pop(0)
                if not self.compare_docs(want_first, got_first):
                    return False
                # A '...' tail on the expected child at the point where the
                # actual children run out ends the comparison successfully.
                if not got_children and want_first.tail == '...':
                    break
        return True

    def text_compare(self, want, got, strip):
        """Return True if text *got* matches *want*; ``...`` is a wildcard.

        ``None`` is treated as the empty string.  When *strip* is true both
        sides are passed through norm_whitespace() and stripped first.
        """
        want = want or ''
        got = got or ''
        if strip:
            want = norm_whitespace(want).strip()
            got = norm_whitespace(got).strip()
        # Escape everything, then turn the escaped '...' back into a wildcard.
        want = '^%s$' % re.escape(want)
        want = want.replace(r'\.\.\.', '.*')
        return re.search(want, got) is not None

    def tag_compare(self, want, got):
        """Return True if tag *got* matches *want*.

        ``any`` matches every tag; a ``{...}`` prefix matches any namespace.
        Non-string tags are compared with ``==``.
        """
        if want == 'any':
            return True
        if (not isinstance(want, _basestring)
                or not isinstance(got, _basestring)):
            return want == got
        want = want or ''
        got = got or ''
        if want.startswith('{...}'):
            # Ellipsis on the namespace
            return want.split('}')[-1] == got.split('}')[-1]
        else:
            return want == got

    def output_difference(self, example, got, optionflags):
        """Return a report describing how *got* differs from the example.

        Falls back to the stock report when no parser applies or when either
        side fails to parse (parse errors are listed before the stock report).
        """
        want = example.want
        parser = self.get_parser(want, got, optionflags)
        errors = []
        if parser is not None:
            try:
                want_doc = parser(want)
            except etree.XMLSyntaxError:
                e = sys.exc_info()[1]
                errors.append('In example: %s' % e)
            try:
                got_doc = parser(got)
            except etree.XMLSyntaxError:
                e = sys.exc_info()[1]
                errors.append('In actual output: %s' % e)
        if parser is None or errors:
            value = OutputChecker.output_difference(
                self, example, got, optionflags)
            if errors:
                errors.append(value)
                return '\n'.join(errors)
            else:
                return value
        html = parser is html_fromstring
        diff_parts = []
        diff_parts.append('Expected:')
        diff_parts.append(self.format_doc(want_doc, html, 2))
        diff_parts.append('Got:')
        diff_parts.append(self.format_doc(got_doc, html, 2))
        diff_parts.append('Diff:')
        diff_parts.append(self.collect_diff(want_doc, got_doc, html, 2))
        return '\n'.join(diff_parts)

    def html_empty_tag(self, el, html=True):
        """Return True if *el* is a content-free tag listed in ``empty_tags``.

        Always False when *html* is false.
        """
        if not html:
            return False
        if el.tag not in self.empty_tags:
            return False
        if el.text or len(el):
            # This shouldn't happen (contents in an empty tag)
            return False
        return True

    def format_doc(self, doc, html, indent, prefix=''):
        """Return element *doc* pretty-printed, indented by *indent* spaces.

        *prefix* is inserted before the opening tag; collect_diff() uses it
        for the ``-`` / ``+`` markers.
        """
        parts = []
        if not len(doc):
            # No children...
            parts.append(' '*indent)
            parts.append(prefix)
            parts.append(self.format_tag(doc))
            if not self.html_empty_tag(doc, html):
                if strip(doc.text):
                    parts.append(self.format_text(doc.text))
                parts.append(self.format_end_tag(doc))
            if strip(doc.tail):
                parts.append(self.format_text(doc.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(prefix)
        parts.append(self.format_tag(doc))
        if not self.html_empty_tag(doc, html):
            parts.append('\n')
            if strip(doc.text):
                parts.append(' '*indent)
                parts.append(self.format_text(doc.text))
                parts.append('\n')
            for el in doc:
                parts.append(self.format_doc(el, html, indent+2))
            parts.append(' '*indent)
            parts.append(self.format_end_tag(doc))
            parts.append('\n')
        if strip(doc.tail):
            parts.append(' '*indent)
            parts.append(self.format_text(doc.tail))
            parts.append('\n')
        return ''.join(parts)

    def format_text(self, text, strip=True):
        """Return *text* HTML-escaped (quotes included); None becomes ''.

        When *strip* is true, surrounding whitespace is removed first.
        """
        if text is None:
            return ''
        if strip:
            text = text.strip()
        # cgi.escape() was removed in Python 3.8; prefer html.escape() and
        # only fall back to cgi where the html module does not exist.
        # html.escape() additionally escapes single quotes.
        try:
            from html import escape as html_escape
        except ImportError:
            from cgi import escape as html_escape
        return html_escape(text, True)

    def format_tag(self, el):
        """Return the opening tag of *el* with its attributes sorted by name."""
        attrs = []
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '<!--'
        for name, value in sorted(el.attrib.items()):
            attrs.append('%s="%s"' % (name, self.format_text(value, False)))
        if not attrs:
            return '<%s>' % el.tag
        return '<%s %s>' % (el.tag, ' '.join(attrs))

    def format_end_tag(self, el):
        """Return the closing tag of *el* (``-->`` for comments)."""
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '-->'
        return '</%s>' % el.tag

    def collect_diff(self, want, got, html, indent):
        """Return a diff-like rendering of *got* against *want*.

        Children present only in *got* are prefixed with ``-``; children
        present only in *want* are prefixed with ``+``.
        """
        parts = []
        if not len(want) and not len(got):
            parts.append(' '*indent)
            parts.append(self.collect_diff_tag(want, got))
            if not self.html_empty_tag(got, html):
                parts.append(self.collect_diff_text(want.text, got.text))
                parts.append(self.collect_diff_end_tag(want, got))
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(self.collect_diff_tag(want, got))
        parts.append('\n')
        if strip(want.text) or strip(got.text):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.text, got.text))
            parts.append('\n')
        want_children = list(want)
        got_children = list(got)
        while want_children or got_children:
            if not want_children:
                parts.append(self.format_doc(got_children.pop(0), html, indent+2, '-'))
                continue
            if not got_children:
                parts.append(self.format_doc(want_children.pop(0), html, indent+2, '+'))
                continue
            parts.append(self.collect_diff(
                want_children.pop(0), got_children.pop(0), html, indent+2))
        parts.append(' '*indent)
        parts.append(self.collect_diff_end_tag(want, got))
        parts.append('\n')
        if strip(want.tail) or strip(got.tail):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
        return ''.join(parts)

    def collect_diff_tag(self, want, got):
        """Return the opening tag for the diff, annotating mismatches.

        Attributes only in *got* are prefixed with ``-``, attributes only in
        *want* with ``+``; differing values are shown as
        ``expected (got: actual)``.
        """
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        attrs = []
        match_any = want.tag == 'any' or 'any' in want.attrib
        for name, value in sorted(got.attrib.items()):
            if name not in want.attrib and not match_any:
                attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
            else:
                if name in want.attrib:
                    # collect_diff_text() takes (want, got): the expected
                    # value goes first so that '...' in it acts as a wildcard.
                    text = self.collect_diff_text(want.attrib[name], value, False)
                else:
                    text = self.format_text(value, False)
                attrs.append('%s="%s"' % (name, text))
        if not match_any:
            for name, value in sorted(want.attrib.items()):
                if name in got.attrib:
                    continue
                attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
        if attrs:
            tag = '<%s %s>' % (tag, ' '.join(attrs))
        else:
            tag = '<%s>' % tag
        return tag

    def collect_diff_end_tag(self, want, got):
        """Return the closing tag for the diff, annotating a tag mismatch."""
        # Use tag_compare() so wildcard tags are treated the same way as in
        # collect_diff_tag().
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        return '</%s>' % tag

    def collect_diff_text(self, want, got, strip=True):
        """Return escaped *got* if it matches *want*, else ``want (got: got)``."""
        if self.text_compare(want, got, strip):
            if not got:
                return ''
            return self.format_text(got, strip)
        text = '%s (got: %s)' % (want, got)
        return self.format_text(text, strip)
366
class LHTMLOutputChecker(LXMLOutputChecker):
    """Variant of LXMLOutputChecker whose default parser is the HTML one."""

    def get_default_parser(self):
        """Return ``html_fromstring`` as the default markup parser."""
        default_parser = html_fromstring
        return default_parser
370
def install(html=False):
    """
    Install doctestcompare for all future doctests.

    The checker class is stored as ``doctest.OutputChecker``: the HTML
    variant when *html* is true, the XML variant otherwise.
    """
    doctest.OutputChecker = LHTMLOutputChecker if html else LXMLOutputChecker
382
def temp_install(html=False, del_module=None):
    """
    Use this *inside* a doctest to enable this checker for this
    doctest only.

    If html is true the HTML parser is the default; otherwise the XML
    parser is.  *del_module* is handed to ``_RestoreChecker``, which removes
    that module from ``sys.modules`` once the checker is restored.
    """
    checker_class = LHTMLOutputChecker if html else LXMLOutputChecker
    doctest_frame = _find_doctest_frame()
    runner = doctest_frame.f_locals['self']
    new_checker = checker_class()
    previous_checker = runner._checker
    runner._checker = new_checker
    # The function that runs the doctests holds a local variable 'check',
    # a bound method of the output checker.  The frame itself can't be
    # modified, so the function object behind that method is patched in
    # place by swapping its code object.  The original code is put back
    # when __record_outcome runs, which marks the end of the __run method.
    bound_check = doctest_frame.f_locals['check']
    replacement = new_checker.check_output
    if _IS_PYTHON_3:
        check_func = bound_check.__func__
        checker_check_func = replacement.__func__
    else:
        check_func = bound_check.im_func
        checker_check_func = replacement.im_func
    # The transplanted code keeps doctest's globals, so the one extra
    # global that check_output needs is provided there:
    doctest.etree = etree
    _RestoreChecker(runner, previous_checker, new_checker,
                    check_func, checker_check_func,
                    del_module)
421
class _RestoreChecker(object):
    """Swap in a temporary check function and undo the swap later.

    On construction the code object of *check_func* is replaced by that of
    *clone_func*, and the instance installs itself as the runner's
    ``_DocTestRunner__record_outcome``.  The first call to that hook
    restores everything and forwards to the original hook.
    """

    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        self.dt_self = dt_self
        self.checker = old_checker
        # Markers read by the transplanted check_output code.
        old_checker._temp_call_super_check_output = self.call_super
        old_checker._temp_override_self = new_checker
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        self.install_clone()
        self.install_dt_self()

    def install_clone(self):
        """Remember check_func's code/globals, then swap in the clone's code."""
        target = self.check_func
        if _IS_PYTHON_3:
            self.func_code, self.func_globals = target.__code__, target.__globals__
            target.__code__ = self.clone_func.__code__
        else:
            self.func_code, self.func_globals = target.func_code, target.func_globals
            target.func_code = self.clone_func.func_code

    def uninstall_clone(self):
        """Put the remembered code object back onto check_func."""
        code_attr = '__code__' if _IS_PYTHON_3 else 'func_code'
        setattr(self.check_func, code_attr, self.func_code)

    def install_dt_self(self):
        """Replace the runner's record-outcome hook with this object."""
        runner = self.dt_self
        self.prev_func = runner._DocTestRunner__record_outcome
        runner._DocTestRunner__record_outcome = self

    def uninstall_dt_self(self):
        """Restore the runner's original record-outcome hook."""
        self.dt_self._DocTestRunner__record_outcome = self.prev_func

    def uninstall_module(self):
        """Drop ``del_module`` from sys.modules and from its parent package."""
        if not self.del_module:
            return
        import sys
        del sys.modules[self.del_module]
        if '.' in self.del_module:
            parent_name, child_name = self.del_module.rsplit('.', 1)
            parent_module = sys.modules[parent_name]
            delattr(parent_module, child_name)

    def __call__(self, *args, **kw):
        """Undo all patches, then forward to the original hook."""
        self.uninstall_clone()
        self.uninstall_dt_self()
        del self.checker._temp_override_self
        del self.checker._temp_call_super_check_output
        outcome = self.prev_func(*args, **kw)
        self.uninstall_module()
        return outcome

    def call_super(self, *args, **kw):
        """Call check_func with its original code, then re-install the clone."""
        self.uninstall_clone()
        try:
            return self.check_func(*args, **kw)
        finally:
            self.install_clone()
475
def _find_doctest_frame():
    """Return the nearest calling frame that has a local named ``BOOM``.

    A ``BOOM`` local is taken as the sign of doctest's frame.  Raises
    LookupError if no calling frame has one.
    """
    import sys
    candidate = sys._getframe(1)
    while candidate is not None:
        if 'BOOM' in candidate.f_locals:
            # Sign of doctest
            return candidate
        candidate = candidate.f_back
    raise LookupError(
        "Could not find doctest (only use this function *inside* a doctest)")
# Self-test picked up by doctest.testmod().  The examples use the function
# form ``print(...)`` with a single argument, which is valid on both
# Python 2 and Python 3 (the statement form is a SyntaxError on Python 3).
__test__ = {
    'basic': '''
    >>> temp_install()
    >>> print("""<xml a="1" b="2">stuff</xml>""")
    <xml b="2" a="1">...</xml>
    >>> print("""<xml xmlns="http://example.com"><tag attr="bar" /></xml>""")
    <xml xmlns="...">
      <tag attr="..." />
    </xml>
    >>> print("""<xml>blahblahblah<foo /></xml>""") # doctest: +NOPARSE_MARKUP, +ELLIPSIS
    <xml>...foo /></xml>
    '''}

if __name__ == '__main__':
    import doctest
    doctest.testmod()