Package lxml :: Module doctestcompare
[hide private]
[frames | no frames]

Source Code for Module lxml.doctestcompare

  1  """ 
  2  lxml-based doctest output comparison. 
  3   
  4  To use this you must call ``lxmldoctest.install()``, which will cause 
  5  doctest to use this in all subsequent calls. 
  6   
  7  This changes the way output is checked and comparisons are made for 
  8  XML or HTML-like content. 
  9   
 10  XML or HTML content is noticed because the example starts with ``<`` 
 11  (it's HTML if it starts with ``<html``).  You can also use the 
 12  ``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing. 
 13   
 14  Some rough wildcard-like things are allowed.  Whitespace is generally 
 15  ignored (except in attributes).  In text (attributes and text in the 
 16  body) you can use ``...`` as a wildcard.  In an example it also 
 17  matches any trailing tags in the element, though it does not match 
 18  leading tags.  You may create a tag ``<any>`` or include an ``any`` 
 19  attribute in the tag.  An ``any`` tag matches any tag, while the 
 20  attribute matches any and all attributes. 
 21   
 22  When a match fails, the reformatted example and the actual output are
 23  displayed (indented), and a rough diff-like output is given.  Anything
 24  marked with ``-`` is in the output but wasn't supposed to be, and
 25  similarly ``+`` means it's in the example but wasn't in the output.
 26   
 27  You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP`` 
 28  """ 
 29   
 30  from lxml import etree 
 31  import re 
 32  import doctest 
 33  import cgi 
 34   
# Public names of this module.
__all__ = ['PARSE_HTML', 'PARSE_XML', 'LXMLOutputChecker',
           'LHTMLOutputChecker', 'install', 'temp_install']

# Extra doctest option flags registered by this module.  get_parser()
# consults them: PARSE_HTML / PARSE_XML force a parser, NOPARSE_MARKUP
# disables markup parsing for an example.
PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
PARSE_XML = doctest.register_optionflag('PARSE_XML')
NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP')

# Whatever doctest.OutputChecker is when this module is imported; the
# checkers below derive from it, and install() later rebinds the attribute
# on the doctest module itself.
OutputChecker = doctest.OutputChecker
 43   
def strip(v):
    """Return ``v.strip()``, passing ``None`` through unchanged."""
    if v is not None:
        return v.strip()
    return None
49
def norm_whitespace(v):
    """Return ``v`` with every match of ``_norm_whitespace_re`` replaced by one space."""
    collapsed = _norm_whitespace_re.sub(' ', v)
    return collapsed
# Module-wide HTML parser used by html_fromstring().
# NOTE(review): recover=False presumably makes broken HTML raise instead of
# being repaired, and remove_blank_text=True presumably drops
# whitespace-only text nodes -- confirm against the lxml documentation.
_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True)
def html_fromstring(html):
    """Parse the string ``html`` with the module's HTML parser.

    Returns whatever ``etree.fromstring`` returns for that parser.
    """
    document_root = etree.fromstring(html, _html_parser)
    return document_root
# We use this to distinguish repr()s from elements:
# (e.g. text such as ``<Foo object at 0x...>`` starts with ``<`` but
# contains `` at `` or `` object `` before the first ``>``).
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
# Matches a run of two or more spaces, tabs or newlines; a single
# whitespace character on its own is not matched.
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')
class LXMLOutputChecker(OutputChecker):
    """
    Doctest output checker that compares XML/HTML output as parsed trees.

    When both the expected and the actual output look like markup (or a
    ``PARSE_HTML`` / ``PARSE_XML`` flag is set) they are parsed and compared
    element by element; otherwise the comparison is delegated to the plain
    ``OutputChecker``.
    """

    # Tags treated as content-less when formatting HTML (no end tag is
    # printed for them).
    empty_tags = (
        'param', 'img', 'area', 'br', 'basefont', 'input',
        'base', 'meta', 'link', 'col')

    # Base type of "string" tags for tag_compare(): ``basestring`` where it
    # exists (Python 2), plain ``str`` otherwise.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def get_default_parser(self):
        """Return the parser used for markup when no flag picks one."""
        return etree.XML

    def check_output(self, want, got, optionflags):
        """
        Return true if ``got`` matches ``want``.

        Markup is parsed and compared with ``compare_docs``; a parse error
        on either side counts as a mismatch.  Non-markup output goes to the
        regular ``OutputChecker.check_output``.
        """
        # NOTE: temp_install() copies this function's code object onto the
        # check function doctest is already holding, so this body may then
        # run with ``self`` being the *old* checker and with the doctest
        # module's globals.  Keep the statements and the global names used
        # here (OutputChecker, etree) exactly as they are.
        alt_self = getattr(self, '_temp_override_self', None)
        if alt_self is not None:
            super_method = self._temp_call_super_check_output
            self = alt_self
        else:
            super_method = OutputChecker.check_output
        parser = self.get_parser(want, got, optionflags)
        if not parser:
            return super_method(
                self, want, got, optionflags)
        try:
            want_doc = parser(want)
        except etree.XMLSyntaxError:
            return False
        try:
            got_doc = parser(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(want_doc, got_doc)

    def get_parser(self, want, got, optionflags):
        """
        Pick the parser callable for this example, or ``None`` for "do not
        parse".

        Precedence: ``NOPARSE_MARKUP`` (no parsing), ``PARSE_HTML``,
        ``PARSE_XML``, both texts starting with ``<html`` (HTML), both
        texts looking like markup (default parser).
        """
        if NOPARSE_MARKUP & optionflags:
            return None
        if PARSE_HTML & optionflags:
            return html_fromstring
        if PARSE_XML & optionflags:
            return etree.XML
        # The ``<html`` sniff ignores case on both sides, so ``<HTML>`` in
        # the actual output is treated the same as in the example.
        if (want.strip().lower().startswith('<html')
                and got.strip().lower().startswith('<html')):
            return html_fromstring
        if (self._looks_like_markup(want)
                and self._looks_like_markup(got)):
            return self.get_default_parser()
        return None

    def _looks_like_markup(self, s):
        """Return true if ``s`` starts with ``<`` and is not an object repr."""
        s = s.strip()
        return (s.startswith('<')
                and not _repr_re.search(s))

    def compare_docs(self, want, got):
        """
        Recursively compare the element ``got`` against the pattern
        element ``want``; return ``True`` on a match.

        Tag, text and tail are compared first.  Attributes must have the
        same names and matching values unless ``want`` carries an ``any``
        attribute.  Children are compared pairwise unless ``want`` has the
        text ``...`` and no children.
        """
        if not self.tag_compare(want.tag, got.tag):
            return False
        if not self.text_compare(want.text, got.text, True):
            return False
        if not self.text_compare(want.tail, got.tail, True):
            return False
        if 'any' not in want.attrib:
            want_keys = sorted(want.attrib.keys())
            got_keys = sorted(got.attrib.keys())
            if want_keys != got_keys:
                return False
            for key in want_keys:
                if not self.text_compare(want.attrib[key], got.attrib[key], False):
                    return False
        if want.text != '...' or len(want):
            want_children = list(want)
            got_children = list(got)
            while want_children or got_children:
                if not want_children or not got_children:
                    return False
                want_first = want_children.pop(0)
                got_first = got_children.pop(0)
                if not self.compare_docs(want_first, got_first):
                    return False
                # Stops once the output has no children left and the
                # example child just compared is followed by ``...``.
                # NOTE(review): the module docstring says ``...`` matches
                # trailing tags in the element, which suggests testing
                # ``want_children`` here -- confirm the intended rule.
                if not got_children and want_first.tail == '...':
                    break
        return True

    def text_compare(self, want, got, strip):
        """
        Return ``True`` if the text ``got`` matches the pattern ``want``.

        ``None`` is treated as the empty string.  With ``strip`` true both
        texts go through ``norm_whitespace`` and are stripped first.  Each
        ``...`` in ``want`` matches any run of characters (as regex ``.*``).
        """
        want = want or ''
        got = got or ''
        if strip:
            want = norm_whitespace(want).strip()
            got = norm_whitespace(got).strip()
        pattern = '^%s$' % re.escape(want)
        # re.escape() turned each literal ``...`` into ``\.\.\.``.
        pattern = pattern.replace(r'\.\.\.', '.*')
        return bool(re.search(pattern, got))

    def tag_compare(self, want, got):
        """
        Return true if the tag ``got`` matches the pattern tag ``want``.

        ``any`` matches every tag, and a ``{...}`` prefix on ``want``
        matches any namespace.  Tags that are not strings are compared
        with ``==``.
        """
        if want == 'any':
            return True
        if (not isinstance(want, self._string_types)
                or not isinstance(got, self._string_types)):
            return want == got
        if want.startswith('{...}'):
            # Ellipsis on the namespace
            return want.split('}')[-1] == got.split('}')[-1]
        return want == got

    def output_difference(self, example, got, optionflags):
        """
        Return the failure report for ``example`` given the output ``got``.

        If the texts are not parsed, or either fails to parse, the regular
        ``OutputChecker`` report is returned (preceded by the parse errors,
        if any).  Otherwise the report shows both documents reformatted
        plus a rough diff.
        """
        want = example.want
        parser = self.get_parser(want, got, optionflags)
        errors = []
        if parser is not None:
            # sys.exc_info() instead of ``except X, e`` so the syntax is
            # valid on every Python version.
            import sys
            try:
                want_doc = parser(want)
            except etree.XMLSyntaxError:
                errors.append('In example: %s' % sys.exc_info()[1])
            try:
                got_doc = parser(got)
            except etree.XMLSyntaxError:
                errors.append('In actual output: %s' % sys.exc_info()[1])
        if parser is None or errors:
            value = OutputChecker.output_difference(
                self, example, got, optionflags)
            if errors:
                errors.append(value)
                return '\n'.join(errors)
            else:
                return value
        html = parser is html_fromstring
        diff_parts = []
        diff_parts.append('Expected:')
        diff_parts.append(self.format_doc(want_doc, html, 2))
        diff_parts.append('Got:')
        diff_parts.append(self.format_doc(got_doc, html, 2))
        diff_parts.append('Diff:')
        diff_parts.append(self.collect_diff(want_doc, got_doc, html, 2))
        return '\n'.join(diff_parts)

    def html_empty_tag(self, el, html=True):
        """
        Return ``True`` if ``el`` should be printed without an end tag:
        HTML mode, tag listed in ``empty_tags``, and no text or children.
        """
        if not html:
            return False
        if el.tag not in self.empty_tags:
            return False
        if el.text or len(el):
            # This shouldn't happen (contents in an empty tag)
            return False
        return True

    def format_doc(self, doc, html, indent, prefix=''):
        """
        Return the element ``doc`` serialized as indented text.

        ``indent`` is the number of leading spaces; ``prefix`` is inserted
        before the start tag (``collect_diff`` passes ``-`` or ``+``).
        Childless elements are written on a single line.
        """
        parts = []
        if not len(doc):
            # No children...
            parts.append(' '*indent)
            parts.append(prefix)
            parts.append(self.format_tag(doc))
            if not self.html_empty_tag(doc, html):
                if strip(doc.text):
                    parts.append(self.format_text(doc.text))
                parts.append(self.format_end_tag(doc))
            if strip(doc.tail):
                parts.append(self.format_text(doc.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(prefix)
        parts.append(self.format_tag(doc))
        if not self.html_empty_tag(doc, html):
            parts.append('\n')
            if strip(doc.text):
                parts.append(' '*indent)
                parts.append(self.format_text(doc.text))
                parts.append('\n')
            for el in doc:
                parts.append(self.format_doc(el, html, indent+2))
            parts.append(' '*indent)
            parts.append(self.format_end_tag(doc))
            parts.append('\n')
        if strip(doc.tail):
            parts.append(' '*indent)
            parts.append(self.format_text(doc.tail))
            parts.append('\n')
        return ''.join(parts)

    def format_text(self, text, strip=True):
        """
        Return ``text`` escaped for display (``&``, ``<``, ``>`` and
        ``"``); ``None`` becomes ``''``.  With ``strip`` true, surrounding
        whitespace is removed first.
        """
        if text is None:
            return ''
        if strip:
            text = text.strip()
        # cgi.escape() is gone from recent Pythons; html.escape() is its
        # replacement (it additionally escapes the single quote).
        try:
            from html import escape
        except ImportError:
            from cgi import escape
        return escape(text, 1)

    def format_tag(self, el):
        """Return the start tag of ``el`` with its attributes sorted by name."""
        attrs = []
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '<!--'
        for name, value in sorted(el.attrib.items()):
            attrs.append('%s="%s"' % (name, self.format_text(value, False)))
        if not attrs:
            return '<%s>' % el.tag
        return '<%s %s>' % (el.tag, ' '.join(attrs))

    def format_end_tag(self, el):
        """Return the end tag of ``el`` (``-->`` for comments)."""
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '-->'
        return '</%s>' % el.tag

    def collect_diff(self, want, got, html, indent):
        """
        Return a rough, indented diff of the elements ``want`` and ``got``.

        Children are paired by position.  Surplus children of ``got`` are
        printed with a ``-`` prefix, surplus children of ``want`` with ``+``.
        """
        parts = []
        if not len(want) and not len(got):
            parts.append(' '*indent)
            parts.append(self.collect_diff_tag(want, got))
            if not self.html_empty_tag(got, html):
                parts.append(self.collect_diff_text(want.text, got.text))
                parts.append(self.collect_diff_end_tag(want, got))
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(self.collect_diff_tag(want, got))
        parts.append('\n')
        if strip(want.text) or strip(got.text):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.text, got.text))
            parts.append('\n')
        want_children = list(want)
        got_children = list(got)
        while want_children or got_children:
            if not want_children:
                parts.append(self.format_doc(got_children.pop(0), html, indent+2, '-'))
                continue
            if not got_children:
                parts.append(self.format_doc(want_children.pop(0), html, indent+2, '+'))
                continue
            parts.append(self.collect_diff(
                want_children.pop(0), got_children.pop(0), html, indent+2))
        parts.append(' '*indent)
        parts.append(self.collect_diff_end_tag(want, got))
        parts.append('\n')
        if strip(want.tail) or strip(got.tail):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
        return ''.join(parts)

    def collect_diff_tag(self, want, got):
        """
        Return the start-tag line of the diff for ``want`` versus ``got``.

        A tag mismatch is shown as ``want (got: got)``.  Attributes only
        in ``got`` get a ``-`` prefix, attributes only in ``want`` a ``+``
        prefix; shared attributes show their value, or
        ``want (got: got)`` when the values differ.
        """
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        attrs = []
        match_any = want.tag == 'any' or 'any' in want.attrib
        for name, value in sorted(got.attrib.items()):
            if name not in want.attrib and not match_any:
                attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
            else:
                if name in want.attrib:
                    # Expected value first, actual value second, so the
                    # ``...`` wildcard is read from the example and the
                    # "(got: ...)" note names the actual output.
                    text = self.collect_diff_text(want.attrib[name], value, False)
                else:
                    text = self.format_text(value, False)
                attrs.append('%s="%s"' % (name, text))
        if not match_any:
            for name, value in sorted(want.attrib.items()):
                if name in got.attrib:
                    continue
                attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
        if attrs:
            tag = '<%s %s>' % (tag, ' '.join(attrs))
        else:
            tag = '<%s>' % tag
        return tag

    def collect_diff_end_tag(self, want, got):
        """
        Return the end-tag line of the diff, using the same matching rule
        (``tag_compare``) as the start tag.
        """
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        return '</%s>' % tag

    def collect_diff_text(self, want, got, strip=True):
        """
        Return the diff rendering of a text value: the escaped ``got`` when
        it matches ``want``, otherwise the escaped ``want (got: got)``.
        """
        if self.text_compare(want, got, strip):
            if not got:
                return ''
            return self.format_text(got, strip)
        text = '%s (got: %s)' % (want, got)
        return self.format_text(text, strip)
348
class LHTMLOutputChecker(LXMLOutputChecker):
    """LXMLOutputChecker variant whose default parser is the HTML parser."""

    def get_default_parser(self):
        """Return ``html_fromstring`` as the parser for unflagged markup."""
        return html_fromstring
352
def install(html=False):
    """
    Install doctestcompare for all future doctests.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.
    """
    # Start from the XML checker and switch to the HTML one on request.
    checker_class = LXMLOutputChecker
    if html:
        checker_class = LHTMLOutputChecker
    doctest.OutputChecker = checker_class
364
def temp_install(html=False, del_module=None):
    """
    Use this *inside* a doctest to enable this checker for this
    doctest only.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.

    If del_module is given, the module of that name is removed from
    ``sys.modules`` (and from its parent package) by the ``_RestoreChecker``
    hook when it undoes the installation.

    Raises LookupError (from ``_find_doctest_frame``) when no doctest
    frame is found on the call stack.
    """
    if html:
        Checker = LHTMLOutputChecker
    else:
        Checker = LXMLOutputChecker
    # Locate the running doctest frame and swap the checker on the object
    # bound to its local ``self``.
    frame = _find_doctest_frame()
    dt_self = frame.f_locals['self']
    checker = Checker()
    old_checker = dt_self._checker
    dt_self._checker = checker
    # The unfortunate thing is that there is a local variable 'check'
    # in the function that runs the doctests, that is a bound method
    # into the output checker.  We have to update that.  We can't
    # modify the frame, so we have to modify the object in place.  The
    # only way to do this is to actually change the func_code
    # attribute of the method.  We change it, and then wait for
    # __record_outcome to be run, which signals the end of the __run
    # method, at which point we restore the previous check_output
    # implementation.
    check_func = frame.f_locals['check'].im_func
    # Because we can't patch up func_globals, this is the only global
    # in check_output that we care about:
    doctest.etree = etree
    # The instance is not kept here: its constructor hooks it into
    # dt_self and old_checker, which is what keeps it reachable.
    _RestoreChecker(dt_self, old_checker, checker,
                    check_func, checker.check_output.im_func,
                    del_module)
398
class _RestoreChecker(object):
    """
    Helper for temp_install(): patches the running doctest so that it uses
    the new checker, and undoes the patch when the hooked
    ``__record_outcome`` is called.
    """

    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        """
        Store the participants and apply both patches.

        dt_self      -- object whose ``_DocTestRunner__record_outcome`` is hooked
        old_checker  -- checker that was active before temp_install()
        new_checker  -- checker that check_output should act as
        check_func   -- function whose code object gets replaced
        clone_func   -- function supplying the replacement code object
        del_module   -- optional module name to unload afterwards
        """
        self.dt_self = dt_self
        self.checker = old_checker
        # check_output() looks these two attributes up on the old checker
        # to find the new checker and a way to reach the original code.
        self.checker._temp_call_super_check_output = self.call_super
        self.checker._temp_override_self = new_checker
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        self.install_clone()
        self.install_dt_self()

    def install_clone(self):
        """Save check_func's code and globals, then give it clone_func's code."""
        self.func_code = self.check_func.func_code
        # Saved for reference only; nothing in this class reads it back.
        self.func_globals = self.check_func.func_globals
        self.check_func.func_code = self.clone_func.func_code

    def uninstall_clone(self):
        """Put check_func's saved code object back."""
        self.check_func.func_code = self.func_code

    def install_dt_self(self):
        """Replace dt_self's ``__record_outcome`` with this object, keeping the old one."""
        self.prev_func = self.dt_self._DocTestRunner__record_outcome
        self.dt_self._DocTestRunner__record_outcome = self

    def uninstall_dt_self(self):
        """Restore dt_self's previous ``__record_outcome``."""
        self.dt_self._DocTestRunner__record_outcome = self.prev_func

    def uninstall_module(self):
        """
        If del_module was given, delete it from ``sys.modules`` and, for a
        dotted name, remove the attribute from its parent package.
        """
        if self.del_module:
            import sys
            del sys.modules[self.del_module]
            if '.' in self.del_module:
                package, module = self.del_module.rsplit('.', 1)
                package_mod = sys.modules[package]
                delattr(package_mod, module)

    def __call__(self, *args, **kw):
        """
        Stand-in for ``__record_outcome``: undo every patch, call the
        previous ``__record_outcome`` with the same arguments, unload
        del_module, and return the previous function's result.
        """
        self.uninstall_clone()
        self.uninstall_dt_self()
        del self.checker._temp_override_self
        del self.checker._temp_call_super_check_output
        result = self.prev_func(*args, **kw)
        self.uninstall_module()
        return result

    def call_super(self, *args, **kw):
        """
        Call check_func with its original code: the clone is removed for
        the duration of the call and reinstalled afterwards, even if the
        call raises.
        """
        self.uninstall_clone()
        try:
            return self.check_func(*args, **kw)
        finally:
            self.install_clone()
444
def _find_doctest_frame():
    """
    Walk up the call stack, starting at the caller, and return the first
    frame that has a local variable named ``BOOM``.

    Raises LookupError if no such frame exists.
    """
    import sys
    candidate = sys._getframe(1)
    while candidate:
        if 'BOOM' in candidate.f_locals:
            # Sign of doctest
            return candidate
        candidate = candidate.f_back
    raise LookupError(
        "Could not find doctest (only use this function *inside* a doctest)")
# Self-test examples; run by doctest.testmod() below when this module is
# executed as a script.  The examples use the Python 2 print statement.
__test__ = {
    'basic': '''
    >>> temp_install()
    >>> print """<xml a="1" b="2">stuff</xml>"""
    <xml b="2" a="1">...</xml>
    >>> print """<xml xmlns="http://example.com"><tag attr="bar" /></xml>"""
    <xml xmlns="...">
       <tag attr="..." />
    </xml>
    >>> print """<xml>blahblahblah<foo /></xml>""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS
    <xml>...foo /></xml>
    '''}

if __name__ == '__main__':
    import doctest
    doctest.testmod()