Package lxml :: Module doctestcompare
[frames | no frames]

Source Code for Module lxml.doctestcompare

  1  """ 
  2  lxml-based doctest output comparison. 
  3   
  4  Note: normally, you should just import the `lxml.usedoctest` and 
  5  `lxml.html.usedoctest` modules from within a doctest, instead of this 
  6  one:: 
  7   
  8      >>> import lxml.usedoctest # for XML output 
  9   
 10      >>> import lxml.html.usedoctest # for HTML output 
 11   
 12  To use this module directly, you must call ``lxml.doctestcompare.install()``, 
 13  which will cause doctest to use this in all subsequent calls. 
 14   
 15  This changes the way output is checked and comparisons are made for 
 16  XML or HTML-like content. 
 17   
 18  XML or HTML content is noticed because the example starts with ``<`` 
 19  (it's HTML if it starts with ``<html``).  You can also use the 
 20  ``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing. 
 21   
 22  Some rough wildcard-like things are allowed.  Whitespace is generally 
 23  ignored (except in attributes).  In text (attributes and text in the 
 24  body) you can use ``...`` as a wildcard.  In an example it also 
 25  matches any trailing tags in the element, though it does not match 
 26  leading tags.  You may create a tag ``<any>`` or include an ``any`` 
 27  attribute in the tag.  An ``any`` tag matches any tag, while the 
 28  attribute matches any and all attributes. 
 29   
 30  When a match fails, the reformatted example and the actual output are 
 31  displayed (indented), and a rough diff-like output is given.  Anything 
 32  marked with ``-`` is in the output but wasn't supposed to be, and 
 33  similarly ``+`` means it's in the example but wasn't in the output. 
 34   
 35  You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP`` 
 36  """ 
 37   
 38  from lxml import etree 
 39  import re 
 40  import doctest 
 41  import cgi 
 42   
 43  __all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker', 
 44             'LHTMLOutputChecker', 'install', 'temp_install'] 
 45   
     # Option flags usable in doctest directives: PARSE_HTML / PARSE_XML force
     # the corresponding parser, NOPARSE_MARKUP turns markup parsing off.
 46  PARSE_HTML = doctest.register_optionflag('PARSE_HTML') 
 47  PARSE_XML = doctest.register_optionflag('PARSE_XML') 
 48  NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP') 
 49   
     # Bound once at import time, so this name keeps referring to whatever
     # doctest.OutputChecker was then, even after install() rebinds it.
 50  OutputChecker = doctest.OutputChecker 
 51   
def strip(v):
    """Return ``v`` without surrounding whitespace; ``None`` passes through."""
    if v is not None:
        return v.strip()
    return None
57
def norm_whitespace(v):
    """Replace every match of ``_norm_whitespace_re`` in ``v`` by one space."""
    collapsed = _norm_whitespace_re.sub(' ', v)
    return collapsed
# Shared parser instance used by html_fromstring().
_html_parser = etree.HTMLParser(
    recover=False,
    remove_blank_text=True)
def html_fromstring(html):
    """Parse the markup string ``html`` with the module-level ``_html_parser``."""
    return etree.fromstring(html, parser=_html_parser)
# Matches the default repr() of an object (``<... at 0x...>`` or
# ``<... object ...>``) so that it is not mistaken for a markup element:
_repr_re = re.compile(r'^<[^>]+ (at|object) ')

# Matches any run of two or more spaces, tabs or newlines.
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')
class LXMLOutputChecker(OutputChecker):
    """doctest output checker that compares markup structurally.

    When the expected and the actual output both look like markup (or a
    ``PARSE_XML`` / ``PARSE_HTML`` option flag is set), both are parsed and
    the resulting trees are compared element by element instead of as raw
    text.  ``...`` is a wildcard in text and attribute values, a tag named
    ``any`` matches any tag, and an ``any`` attribute switches off the
    attribute comparison for that element.  All other output is handed to
    the standard ``doctest.OutputChecker``.
    """

    # Tags that format_doc()/collect_diff() print without content or end
    # tag when formatting HTML.
    empty_tags = (
        'param', 'img', 'area', 'br', 'basefont', 'input',
        'base', 'meta', 'link', 'col')

    def get_default_parser(self):
        """Return the parser used for markup without a more specific hint."""
        return etree.XML

    def check_output(self, want, got, optionflags):
        """Return True if ``got`` matches ``want``.

        NOTE: temp_install() copies this method's code object into
        doctest's own ``check_output`` and only makes the global ``etree``
        available there.  Keep the body free of other module globals
        (``OutputChecker`` and builtins excepted).
        """
        alt_self = getattr(self, '_temp_override_self', None)
        if alt_self is not None:
            # Running as the cloned code on doctest's original checker:
            # switch to the lxml checker and remember how to reach the
            # original implementation.
            super_method = self._temp_call_super_check_output
            self = alt_self
        else:
            super_method = OutputChecker.check_output
        parser = self.get_parser(want, got, optionflags)
        if not parser:
            return super_method(
                self, want, got, optionflags)
        try:
            want_doc = parser(want)
        except etree.XMLSyntaxError:
            return False
        try:
            got_doc = parser(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(want_doc, got_doc)

    def get_parser(self, want, got, optionflags):
        """Return the parser callable for this example, or None.

        None means "do not parse"; the plain doctest comparison is used.
        Explicit option flags win over content sniffing.
        """
        if NOPARSE_MARKUP & optionflags:
            return None
        if PARSE_HTML & optionflags:
            return html_fromstring
        if PARSE_XML & optionflags:
            return etree.XML
        # Both sides are lower-cased so that ``<HTML>`` output is detected
        # the same way as an ``<HTML>`` example.
        if (want.strip().lower().startswith('<html')
                and got.strip().lower().startswith('<html')):
            return html_fromstring
        if (self._looks_like_markup(want)
                and self._looks_like_markup(got)):
            return self.get_default_parser()
        return None

    def _looks_like_markup(self, s):
        """Return a true value if ``s`` starts with ``<`` and is not a repr()."""
        s = s.strip()
        return (s.startswith('<')
                and not _repr_re.search(s))

    def compare_docs(self, want, got):
        """Recursively compare the element ``got`` against the pattern ``want``."""
        if not self.tag_compare(want.tag, got.tag):
            return False
        if not self.text_compare(want.text, got.text, True):
            return False
        if not self.text_compare(want.tail, got.tail, True):
            return False
        if 'any' not in want.attrib:
            want_keys = sorted(want.attrib.keys())
            got_keys = sorted(got.attrib.keys())
            if want_keys != got_keys:
                return False
            for key in want_keys:
                if not self.text_compare(want.attrib[key], got.attrib[key], False):
                    return False
        # A childless ``<tag>...</tag>`` pattern matches any content,
        # including child elements, so the children are skipped then.
        if want.text != '...' or len(want):
            want_children = list(want)
            got_children = list(got)
            index = 0
            while index < len(want_children) or index < len(got_children):
                if index >= len(want_children) or index >= len(got_children):
                    # One side has elements the other side lacks.
                    return False
                want_child = want_children[index]
                if not self.compare_docs(want_child, got_children[index]):
                    return False
                index += 1
                # A ``...`` tail on the pattern child stops the comparison
                # once the actual children are used up.
                if index >= len(got_children) and want_child.tail == '...':
                    break
        return True

    def text_compare(self, want, got, strip):
        """Return True if the text ``got`` matches the pattern ``want``.

        ``None`` counts as the empty string and ``...`` in ``want`` matches
        any run of characters.  With ``strip`` true, both sides are passed
        through ``norm_whitespace`` and stripped first.
        """
        want = want or ''
        got = got or ''
        if strip:
            want = norm_whitespace(want).strip()
            got = norm_whitespace(got).strip()
        want = '^%s$' % re.escape(want)
        want = want.replace(r'\.\.\.', '.*')
        return re.search(want, got) is not None

    def tag_compare(self, want, got):
        """Return True if the tag ``got`` matches the pattern tag ``want``.

        ``any`` matches every tag and a ``{...}`` prefix matches any
        namespace.  Tags that are not strings are compared with ``==``.
        """
        if want == 'any':
            return True
        try:
            string_types = basestring  # Python 2: str and unicode
        except NameError:
            string_types = str  # Python 3
        if (not isinstance(want, string_types)
                or not isinstance(got, string_types)):
            return want == got
        want = want or ''
        got = got or ''
        if want.startswith('{...}'):
            # Ellipsis on the namespace: compare the local names only.
            return want.split('}')[-1] == got.split('}')[-1]
        return want == got

    def output_difference(self, example, got, optionflags):
        """Return a text describing how ``got`` differs from the example.

        If the example is not parsed as markup, or either side fails to
        parse, the standard doctest message is returned (preceded by the
        parse errors, if any).  Otherwise both documents are printed in a
        normalised form, followed by a rough diff.
        """
        import sys
        want = example.want
        parser = self.get_parser(want, got, optionflags)
        errors = []
        if parser is not None:
            # sys.exc_info() is used instead of binding the exception in
            # the except clause so that the code parses on Python 2 and 3.
            try:
                want_doc = parser(want)
            except etree.XMLSyntaxError:
                errors.append('In example: %s' % sys.exc_info()[1])
            try:
                got_doc = parser(got)
            except etree.XMLSyntaxError:
                errors.append('In actual output: %s' % sys.exc_info()[1])
        if parser is None or errors:
            value = OutputChecker.output_difference(
                self, example, got, optionflags)
            if errors:
                errors.append(value)
                return '\n'.join(errors)
            return value
        html = parser is html_fromstring
        diff_parts = [
            'Expected:',
            self.format_doc(want_doc, html, 2),
            'Got:',
            self.format_doc(got_doc, html, 2),
            'Diff:',
            self.collect_diff(want_doc, got_doc, html, 2),
        ]
        return '\n'.join(diff_parts)

    def html_empty_tag(self, el, html=True):
        """Return True if ``el`` is printed as a content-less HTML tag."""
        if not html:
            return False
        if el.tag not in self.empty_tags:
            return False
        if el.text or len(el):
            # This shouldn't happen (contents in an empty tag)
            return False
        return True

    def format_doc(self, doc, html, indent, prefix=''):
        """Return ``doc`` pretty-printed, one element per line.

        ``indent`` is the number of leading spaces and ``prefix`` is put
        in front of the start tag of ``doc`` itself (not of its children).
        """
        parts = []
        if not len(doc):
            # No children: the whole element goes on a single line.
            parts.append(' '*indent)
            parts.append(prefix)
            parts.append(self.format_tag(doc))
            if not self.html_empty_tag(doc, html):
                if strip(doc.text):
                    parts.append(self.format_text(doc.text))
                parts.append(self.format_end_tag(doc))
            if strip(doc.tail):
                parts.append(self.format_text(doc.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(prefix)
        parts.append(self.format_tag(doc))
        if not self.html_empty_tag(doc, html):
            parts.append('\n')
            if strip(doc.text):
                parts.append(' '*indent)
                parts.append(self.format_text(doc.text))
                parts.append('\n')
            for el in doc:
                parts.append(self.format_doc(el, html, indent+2))
            parts.append(' '*indent)
            parts.append(self.format_end_tag(doc))
            parts.append('\n')
        if strip(doc.tail):
            parts.append(' '*indent)
            parts.append(self.format_text(doc.tail))
            parts.append('\n')
        return ''.join(parts)

    def format_text(self, text, strip=True):
        """Return ``text`` escaped for display; ``None`` gives ``''``.

        With ``strip`` true, surrounding whitespace is removed first.
        """
        if text is None:
            return ''
        if strip:
            text = text.strip()
        # cgi.escape() no longer exists on current Python 3; html.escape()
        # is its replacement (it additionally escapes single quotes).
        try:
            from html import escape
        except ImportError:
            from cgi import escape  # Python 2
        return escape(text, True)

    def format_tag(self, el):
        """Return the start tag of ``el`` with its attributes sorted by name."""
        attrs = []
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '<!--'
        for name, value in sorted(el.attrib.items()):
            attrs.append('%s="%s"' % (name, self.format_text(value, False)))
        if not attrs:
            return '<%s>' % el.tag
        return '<%s %s>' % (el.tag, ' '.join(attrs))

    def format_end_tag(self, el):
        """Return the end tag of ``el`` (``-->`` for comments)."""
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '-->'
        return '</%s>' % el.tag

    def collect_diff(self, want, got, html, indent):
        """Return a rough, indented diff of ``got`` against ``want``.

        Elements found only in the output are printed with a ``-`` prefix,
        elements found only in the example with a ``+`` prefix.
        """
        parts = []
        if not len(want) and not len(got):
            parts.append(' '*indent)
            parts.append(self.collect_diff_tag(want, got))
            if not self.html_empty_tag(got, html):
                parts.append(self.collect_diff_text(want.text, got.text))
                parts.append(self.collect_diff_end_tag(want, got))
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(self.collect_diff_tag(want, got))
        parts.append('\n')
        if strip(want.text) or strip(got.text):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.text, got.text))
            parts.append('\n')
        want_children = list(want)
        got_children = list(got)
        # Children are paired up by position; whatever is left over on
        # one side is reported as extra or missing.
        for want_child, got_child in zip(want_children, got_children):
            parts.append(self.collect_diff(
                want_child, got_child, html, indent+2))
        paired = min(len(want_children), len(got_children))
        for extra in got_children[paired:]:
            parts.append(self.format_doc(extra, html, indent+2, '-'))
        for missing in want_children[paired:]:
            parts.append(self.format_doc(missing, html, indent+2, '+'))
        parts.append(' '*indent)
        parts.append(self.collect_diff_end_tag(want, got))
        parts.append('\n')
        if strip(want.tail) or strip(got.tail):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
        return ''.join(parts)

    def collect_diff_tag(self, want, got):
        """Return the start tag for the diff, marking tag/attribute mismatches."""
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        attrs = []
        match_any = want.tag == 'any' or 'any' in want.attrib
        for name, value in sorted(got.attrib.items()):
            if name not in want.attrib and not match_any:
                attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
            else:
                if name in want.attrib:
                    # Expected value first, actual value second, so that a
                    # mismatch reads "expected (got: actual)".
                    text = self.collect_diff_text(want.attrib[name], value, False)
                else:
                    text = self.format_text(value, False)
                attrs.append('%s="%s"' % (name, text))
        if not match_any:
            for name, value in sorted(want.attrib.items()):
                if name in got.attrib:
                    continue
                attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
        if attrs:
            tag = '<%s %s>' % (tag, ' '.join(attrs))
        else:
            tag = '<%s>' % tag
        return tag

    def collect_diff_end_tag(self, want, got):
        """Return the end tag for the diff, marking a tag mismatch."""
        # Same matching rule as collect_diff_tag(), so wildcard tags are
        # not reported as a mismatch in the end tag only.
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        return '</%s>' % tag

    def collect_diff_text(self, want, got, strip=True):
        """Return ``got`` for display, or ``want (got: got)`` on a mismatch."""
        if self.text_compare(want, got, strip):
            if not got:
                return ''
            return self.format_text(got, strip)
        text = '%s (got: %s)' % (want, got)
        return self.format_text(text, strip)
356
class LHTMLOutputChecker(LXMLOutputChecker):
    """Variant of ``LXMLOutputChecker`` whose default parser is the HTML one."""

    def get_default_parser(self):
        """Return ``html_fromstring`` as the parser for unmarked markup."""
        return html_fromstring
360
def install(html=False):
    """
    Make doctest use this module's checker from now on.

    A true ``html`` selects the checker that defaults to the HTML parser;
    otherwise the XML-defaulting checker is installed.
    """
    checker_class = LXMLOutputChecker
    if html:
        checker_class = LHTMLOutputChecker
    doctest.OutputChecker = checker_class
372
def temp_install(html=False, del_module=None):
    """
    Enable this checker for the currently running doctest only; call it
    from *inside* that doctest.

    A true ``html`` selects the checker that defaults to the HTML parser;
    otherwise the XML-defaulting checker is used.  ``del_module`` is handed
    on to ``_RestoreChecker``.
    """
    checker_class = LXMLOutputChecker
    if html:
        checker_class = LHTMLOutputChecker
    frame = _find_doctest_frame()
    dt_self = frame.f_locals['self']
    checker = checker_class()
    old_checker = dt_self._checker
    dt_self._checker = checker
    # The function that runs the doctests holds a local variable 'check',
    # a bound method of the output checker.  The frame cannot be modified,
    # so the object is changed in place instead: the func_code attribute
    # of that method is swapped.  _RestoreChecker then waits for
    # __record_outcome to run, which signals the end of the __run method,
    # and restores the previous check_output implementation.
    check_func = frame.f_locals['check'].im_func
    # func_globals cannot be patched, and this is the only global that
    # check_output needs there:
    doctest.etree = etree
    clone_func = checker.check_output.im_func
    _RestoreChecker(dt_self, old_checker, checker,
                    check_func, clone_func,
                    del_module)
406
class _RestoreChecker(object):
    """Swaps in the cloned ``check_output`` code and undoes it afterwards.

    The instance takes the place of the runner's ``__record_outcome``
    hook; when that hook is called, every change is reverted and the call
    is forwarded to the original hook.
    """

    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        self.dt_self = dt_self
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        self.checker = old_checker
        # The cloned check_output looks these two attributes up on the
        # old checker.
        old_checker._temp_call_super_check_output = self.call_super
        old_checker._temp_override_self = new_checker
        self.install_clone()
        self.install_dt_self()

    def install_clone(self):
        """Save the target's code/globals and put the clone's code in place."""
        target = self.check_func
        self.func_code = target.func_code
        self.func_globals = target.func_globals
        target.func_code = self.clone_func.func_code

    def uninstall_clone(self):
        """Put the saved code object back."""
        self.check_func.func_code = self.func_code

    def install_dt_self(self):
        """Replace the runner's ``__record_outcome`` by this instance."""
        runner = self.dt_self
        self.prev_func = runner._DocTestRunner__record_outcome
        runner._DocTestRunner__record_outcome = self

    def uninstall_dt_self(self):
        """Restore the runner's previous ``__record_outcome``."""
        self.dt_self._DocTestRunner__record_outcome = self.prev_func

    def uninstall_module(self):
        """Remove ``del_module`` (if set) from ``sys.modules`` and its package."""
        if not self.del_module:
            return
        import sys
        name = self.del_module
        del sys.modules[name]
        if '.' in name:
            package, module = name.rsplit('.', 1)
            delattr(sys.modules[package], module)

    def __call__(self, *args, **kw):
        """Undo all patches, then forward the call to the original hook."""
        self.uninstall_clone()
        self.uninstall_dt_self()
        old_checker = self.checker
        del old_checker._temp_override_self
        del old_checker._temp_call_super_check_output
        result = self.prev_func(*args, **kw)
        self.uninstall_module()
        return result

    def call_super(self, *args, **kw):
        """Call the target function with its saved code temporarily restored."""
        self.uninstall_clone()
        try:
            return self.check_func(*args, **kw)
        finally:
            self.install_clone()
452
def _find_doctest_frame():
    """Return the nearest calling frame that has a local named ``BOOM``.

    Such a local is taken as the sign of doctest's runner.  Raises
    ``LookupError`` if no frame on the call stack has one.
    """
    import sys
    # Start at the caller; this function's own frame is never a candidate.
    frame = sys._getframe(1)
    while frame:
        if 'BOOM' in frame.f_locals:
            # Sign of doctest
            return frame
        frame = frame.f_back
    message = "Could not find doctest (only use this function *inside* a doctest)"
    raise LookupError(message)
# Self-test: ``__test__`` holds one extra doctest, 'basic', which calls
# temp_install() and then prints markup; running this file as a script
# calls doctest.testmod().
# NOTE(review): the line breaks of the doctest string were lost when this
# listing was flattened, so the line below is kept exactly as found.
464 465 __test__ = { 466 'basic': ''' 467 >>> temp_install() 468 >>> print """<xml a="1" b="2">stuff</xml>""" 469 <xml b="2" a="1">...</xml> 470 >>> print """<xml xmlns="http://example.com"><tag attr="bar" /></xml>""" 471 <xml xmlns="..."> 472 <tag attr="..." /> 473 </xml> 474 >>> print """<xml>blahblahblah<foo /></xml>""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS 475 <xml>...foo /></xml> 476 '''} 477 478 if __name__ == '__main__': 479 import doctest 480 doctest.testmod() 481