Package lxml :: Module doctestcompare
[hide private]
[frames] | [no frames]

Source Code for Module lxml.doctestcompare

  1  """ 
  2  lxml-based doctest output comparison. 
  3   
  4  To use this you must call ``lxmldoctest.install()``, which will cause 
  5  doctest to use this in all subsequent calls. 
  6   
  7  This changes the way output is checked and comparisons are made for 
  8  XML or HTML-like content. 
  9   
 10  XML or HTML content is noticed because the example starts with ``<`` 
 11  (it's HTML if it starts with ``<html``).  You can also use the 
 12  ``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing. 
 13   
 14  Some rough wildcard-like things are allowed.  Whitespace is generally 
 15  ignored (except in attributes).  In text (attributes and text in the 
 16  body) you can use ``...`` as a wildcard.  In an example it also 
 17  matches any trailing tags in the element, though it does not match 
 18  leading tags.  You may create a tag ``<any>`` or include an ``any`` 
 19  attribute in the tag.  An ``any`` tag matches any tag, while the 
 20  attribute matches any and all attributes. 
 21   
 22  When a match fails, the reformatted example and gotten text are 
 23  displayed (indented), and a rough diff-like output is given.  Anything 
 24  marked with ``-`` is in the output but wasn't supposed to be, and 
 25  similarly ``+`` means it's in the example but wasn't in the output. 
 26   
 27  You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP`` 
 28  """ 
 29   
 30  from lxml import etree 
 31  from lxml.html import document_fromstring 
 32  import re 
 33  import doctest 
 34  import cgi 
 35   
# Names exported by ``from lxml.doctestcompare import *``.
__all__ = ['PARSE_HTML', 'PARSE_XML', 'LXMLOutputChecker',
           'LHTMLOutputChecker', 'install', 'temp_install']

# Extra doctest option flags understood by the checkers in this module:
# PARSE_HTML / PARSE_XML force a specific parser, NOPARSE_MARKUP disables
# markup-aware comparison for an example.
PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
PARSE_XML = doctest.register_optionflag('PARSE_XML')
NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP')

# Reference to doctest's checker class taken at import time; install()
# later rebinds ``doctest.OutputChecker``, so the checkers below subclass
# and delegate to this saved reference.
OutputChecker = doctest.OutputChecker
 44   
def strip(v):
    """Return ``v.strip()``, passing ``None`` through unchanged."""
    return None if v is None else v.strip()
50
def norm_whitespace(v):
    """Collapse every run matched by ``_norm_whitespace_re`` in *v* to one space."""
    collapsed = re.sub(_norm_whitespace_re, ' ', v)
    return collapsed

# We use this to distinguish repr()s from elements: it matches text such as
# ``<Foo object at 0x...>`` or ``<function f at 0x...>``, which starts with
# ``<`` but is not markup.
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
# Matches a run of two or more spaces, tabs or newlines (a single
# whitespace character is left alone).
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')

# Compatibility shims so the checker works on both Python 2 and Python 3.
try:
    _string_types = basestring  # Python 2: covers str and unicode
except NameError:
    _string_types = str  # Python 3

try:
    from html import escape as _escape_markup  # Python 3.2+
except ImportError:
    from cgi import escape as _escape_markup  # Python 2


class LXMLOutputChecker(OutputChecker):
    """
    Doctest output checker that compares XML/HTML output structurally.

    When both the expected and the actual output look like markup (or a
    ``PARSE_XML`` / ``PARSE_HTML`` flag is given) the two are parsed and
    compared element by element; otherwise the stock doctest comparison
    is used.
    """

    # HTML tags that are rendered without content or an end tag.
    empty_tags = (
        'param', 'img', 'area', 'br', 'basefont', 'input',
        'base', 'meta', 'link', 'col')

    def get_default_parser(self):
        """Return the parser used for markup when no flag forces one."""
        return etree.XML

    def check_output(self, want, got, optionflags):
        """
        Return True if *got* matches *want*.

        NOTE: temp_install() copies this function's code object onto
        another function whose globals are those of the ``doctest``
        module.  The body must therefore only rely on the globals
        ``OutputChecker`` and ``etree`` and is deliberately left in its
        original shape.
        """
        alt_self = getattr(self, '_temp_override_self', None)
        if alt_self is not None:
            # Running as a transplanted code object: delegate to the
            # temporary checker and reach the original implementation
            # through the stored callback.
            super_method = self._temp_call_super_check_output
            self = alt_self
        else:
            super_method = OutputChecker.check_output
        parser = self.get_parser(want, got, optionflags)
        if not parser:
            return super_method(
                self, want, got, optionflags)
        try:
            want_doc = parser(want)
        except etree.XMLSyntaxError:
            return False
        try:
            got_doc = parser(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(want_doc, got_doc)

    def get_parser(self, want, got, optionflags):
        """
        Choose the parser for this example, or None for plain comparison.

        Explicit flags win; otherwise HTML is used when both texts start
        with ``<html`` (case-insensitively), and the default parser when
        both texts look like markup.
        """
        if NOPARSE_MARKUP & optionflags:
            return None
        if PARSE_HTML & optionflags:
            return document_fromstring
        if PARSE_XML & optionflags:
            return etree.XML
        # Both sides are lower-cased so ``<HTML>`` output is recognised
        # the same way as an ``<HTML>`` example.
        if (want.strip().lower().startswith('<html')
                and got.strip().lower().startswith('<html')):
            return document_fromstring
        if (self._looks_like_markup(want)
                and self._looks_like_markup(got)):
            return self.get_default_parser()
        return None

    def _looks_like_markup(self, s):
        """Return True if *s* starts with ``<`` and is not an object repr."""
        s = s.strip()
        return (s.startswith('<')
                and not _repr_re.search(s))

    def compare_docs(self, want, got):
        """
        Recursively compare two parsed elements.

        Tag, text, tail and attributes must match (subject to the
        wildcard rules of tag_compare/text_compare).  An ``any``
        attribute on *want* skips the attribute comparison.
        """
        if not self.tag_compare(want.tag, got.tag):
            return False
        if not self.text_compare(want.text, got.text, True):
            return False
        if not self.text_compare(want.tail, got.tail, True):
            return False
        if 'any' not in want.attrib:
            want_keys = sorted(want.attrib.keys())
            got_keys = sorted(got.attrib.keys())
            if want_keys != got_keys:
                return False
            for key in want_keys:
                if not self.text_compare(want.attrib[key], got.attrib[key], False):
                    return False
        # A childless element whose text is exactly '...' matches any
        # content, so children are only compared otherwise.
        if want.text != '...' or len(want):
            want_children = list(want)
            got_children = list(got)
            while want_children or got_children:
                if not want_children or not got_children:
                    return False
                want_first = want_children.pop(0)
                got_first = got_children.pop(0)
                if not self.compare_docs(want_first, got_first):
                    return False
                # Once the output has no more children, a '...' tail on the
                # expected child ends the comparison even if further
                # expected children remain.
                if not got_children and want_first.tail == '...':
                    break
        return True

    def text_compare(self, want, got, strip):
        """
        Compare two text values, treating ``...`` in *want* as a wildcard.

        None is treated as the empty string.  When *strip* is true, runs
        of whitespace are normalised and both ends are stripped first.
        The wildcard is translated to ``.*`` without re.DOTALL.
        """
        want = want or ''
        got = got or ''
        if strip:
            want = norm_whitespace(want).strip()
            got = norm_whitespace(got).strip()
        pattern = '^%s$' % re.escape(want)
        pattern = pattern.replace(r'\.\.\.', '.*')
        return bool(re.search(pattern, got))

    def tag_compare(self, want, got):
        """
        Compare two tags.

        ``any`` matches every tag, and a ``{...}`` namespace prefix on
        *want* matches any namespace.  Non-string tags are compared with
        plain equality.
        """
        if want == 'any':
            return True
        if (not isinstance(want, _string_types)
                or not isinstance(got, _string_types)):
            return want == got
        want = want or ''
        got = got or ''
        if want.startswith('{...}'):
            # Ellipsis on the namespace
            return want.split('}')[-1] == got.split('}')[-1]
        return want == got

    def output_difference(self, example, got, optionflags):
        """
        Return the failure report for *example* given the output *got*.

        Falls back to the stock doctest report when no parser applies or
        when either side fails to parse (parse errors are listed first).
        """
        want = example.want
        parser = self.get_parser(want, got, optionflags)
        errors = []
        if parser is not None:
            try:
                want_doc = parser(want)
            except etree.XMLSyntaxError as e:
                errors.append('In example: %s' % e)
            try:
                got_doc = parser(got)
            except etree.XMLSyntaxError as e:
                errors.append('In actual output: %s' % e)
        if parser is None or errors:
            value = OutputChecker.output_difference(
                self, example, got, optionflags)
            if errors:
                errors.append(value)
                return '\n'.join(errors)
            return value
        html = parser is document_fromstring
        diff_parts = [
            'Expected:',
            self.format_doc(want_doc, html, 2),
            'Got:',
            self.format_doc(got_doc, html, 2),
            'Diff:',
            self.collect_diff(want_doc, got_doc, html, 2),
        ]
        return '\n'.join(diff_parts)

    def html_empty_tag(self, el, html=True):
        """Return True if *el* is a content-free HTML element in empty_tags."""
        if not html:
            return False
        if el.tag not in self.empty_tags:
            return False
        if el.text or len(el):
            # This shouldn't happen (contents in an empty tag)
            return False
        return True

    def format_doc(self, doc, html, indent, prefix=''):
        """
        Render *doc* as indented text, one element per line.

        *prefix* is inserted before the opening tag of *doc* itself; it is
        not passed on to child elements.
        """
        pad = ' ' * indent
        parts = []
        if not len(doc):
            # No children...
            parts.append(pad)
            parts.append(prefix)
            parts.append(self.format_tag(doc))
            if not self.html_empty_tag(doc, html):
                if strip(doc.text):
                    parts.append(self.format_text(doc.text))
                parts.append(self.format_end_tag(doc))
            if strip(doc.tail):
                parts.append(self.format_text(doc.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(pad)
        parts.append(prefix)
        parts.append(self.format_tag(doc))
        if not self.html_empty_tag(doc, html):
            parts.append('\n')
            if strip(doc.text):
                parts.append(pad)
                parts.append(self.format_text(doc.text))
                parts.append('\n')
            for el in doc:
                parts.append(self.format_doc(el, html, indent+2))
            parts.append(pad)
            parts.append(self.format_end_tag(doc))
            parts.append('\n')
        if strip(doc.tail):
            parts.append(pad)
            parts.append(self.format_text(doc.tail))
            parts.append('\n')
        return ''.join(parts)

    def format_text(self, text, strip=True):
        """Return *text* with ``&``, ``<``, ``>`` and ``"`` escaped."""
        if text is None:
            return ''
        if strip:
            text = text.strip()
        # Escape the double quote by hand so the result is the same with
        # either escape implementation (html.escape would also rewrite
        # single quotes when asked to handle quotes).
        return _escape_markup(text, False).replace('"', '&quot;')

    def format_tag(self, el):
        """Return the opening tag of *el* with attributes sorted by name."""
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '<!--'
        attrs = ['%s="%s"' % (name, self.format_text(value, False))
                 for name, value in sorted(el.attrib.items())]
        if not attrs:
            return '<%s>' % el.tag
        return '<%s %s>' % (el.tag, ' '.join(attrs))

    def format_end_tag(self, el):
        """Return the closing tag (or comment terminator) of *el*."""
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '-->'
        return '</%s>' % el.tag

    def collect_diff(self, want, got, html, indent):
        """
        Render a diff-like view of *want* against *got*.

        Children are paired by position; surplus output children are
        marked ``-`` and surplus expected children ``+``.
        """
        pad = ' ' * indent
        parts = []
        if not len(want) and not len(got):
            parts.append(pad)
            parts.append(self.collect_diff_tag(want, got))
            if not self.html_empty_tag(got, html):
                parts.append(self.collect_diff_text(want.text, got.text))
                parts.append(self.collect_diff_end_tag(want, got))
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(pad)
        parts.append(self.collect_diff_tag(want, got))
        parts.append('\n')
        if strip(want.text) or strip(got.text):
            parts.append(pad)
            parts.append(self.collect_diff_text(want.text, got.text))
            parts.append('\n')
        want_children = list(want)
        got_children = list(got)
        while want_children or got_children:
            if not want_children:
                parts.append(self.format_doc(got_children.pop(0), html, indent+2, '-'))
                continue
            if not got_children:
                parts.append(self.format_doc(want_children.pop(0), html, indent+2, '+'))
                continue
            parts.append(self.collect_diff(
                want_children.pop(0), got_children.pop(0), html, indent+2))
        parts.append(pad)
        parts.append(self.collect_diff_end_tag(want, got))
        parts.append('\n')
        if strip(want.tail) or strip(got.tail):
            parts.append(pad)
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
        return ''.join(parts)

    def collect_diff_tag(self, want, got):
        """
        Return the opening tag for the diff of *want* against *got*.

        Attributes only in the output are prefixed ``-``, attributes only
        in the example ``+``; differing values are shown as
        ``expected (got: actual)``.
        """
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        attrs = []
        match_any = want.tag == 'any' or 'any' in want.attrib
        for name, value in sorted(got.attrib.items()):
            if name not in want.attrib and not match_any:
                attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
            else:
                if name in want.attrib:
                    # Expected value first, actual value second, so that
                    # wildcards in the example are honoured and the
                    # "(got: ...)" label names the real output.
                    text = self.collect_diff_text(want.attrib[name], value, False)
                else:
                    text = self.format_text(value, False)
                attrs.append('%s="%s"' % (name, text))
        if not match_any:
            for name, value in sorted(want.attrib.items()):
                if name in got.attrib:
                    continue
                attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
        if attrs:
            tag = '<%s %s>' % (tag, ' '.join(attrs))
        else:
            tag = '<%s>' % tag
        return tag

    def collect_diff_end_tag(self, want, got):
        """Return the closing tag for the diff, flagging a tag mismatch."""
        # Use tag_compare so 'any' and '{...}' wildcards are treated the
        # same way here as in the opening tag.
        if not self.tag_compare(want.tag, got.tag):
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        return '</%s>' % tag

    def collect_diff_text(self, want, got, strip=True):
        """Return *got* if it matches *want*, else ``want (got: got)``, escaped."""
        if self.text_compare(want, got, strip):
            if not got:
                return ''
            return self.format_text(got, strip)
        text = '%s (got: %s)' % (want, got)
        return self.format_text(text, strip)
class LHTMLOutputChecker(LXMLOutputChecker):
    """Variant of LXMLOutputChecker whose default parser is the HTML parser."""

    def get_default_parser(self):
        """Return the HTML document parser used when no flag forces one."""
        return document_fromstring

def install(html=False):
    """
    Install doctestcompare for all future doctests.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.
    """
    # Rebinding the class on the doctest module affects every checker
    # that doctest creates from now on.
    doctest.OutputChecker = LHTMLOutputChecker if html else LXMLOutputChecker
360
def temp_install(html=False, del_module=None):
    """
    Use this *inside* a doctest to enable this checker for this
    doctest only.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.

    If del_module is given, it is the dotted name of a module that is
    removed from ``sys.modules`` (and from its parent package) once the
    doctest runner records the outcome of the current example run.
    """
    if html:
        Checker = LHTMLOutputChecker
    else:
        Checker = LXMLOutputChecker
    # Locate the running doctest runner through the call stack.
    frame = _find_doctest_frame()
    dt_self = frame.f_locals['self']
    checker = Checker()
    old_checker = dt_self._checker
    dt_self._checker = checker
    # The unfortunate thing is that there is a local variable 'check'
    # in the function that runs the doctests, that is a bound method
    # into the output checker.  We have to update that.  We can't
    # modify the frame, so we have to modify the object in place.  The
    # only way to do this is to actually change the func_code
    # attribute of the method.  We change it, and then wait for
    # __record_outcome to be run, which signals the end of the __run
    # method, at which point we restore the previous check_output
    # implementation.
    check_func = frame.f_locals['check'].im_func
    # Because we can't patch up func_globals, this is the only global
    # in check_output that we care about:
    doctest.etree = etree
    # The helper installs itself on construction; no reference to it
    # needs to be kept here.
    _RestoreChecker(dt_self, old_checker, checker,
                    check_func, checker.check_output.im_func,
                    del_module)
394
class _RestoreChecker(object):
    """
    Helper for temp_install(): swaps in the temporary checker and puts
    everything back when the doctest runner records its outcome.
    """

    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        # dt_self: object whose __record_outcome hook is replaced.
        # old_checker: checker that was installed before temp_install().
        # new_checker: temporary checker that should do the comparing.
        # check_func: function whose code object is replaced.
        # clone_func: function whose code object is copied in.
        # del_module: optional module name to drop from sys.modules.
        self.dt_self = dt_self
        self.checker = old_checker
        # These two attributes are what check_output() looks for to
        # redirect itself to the temporary checker.
        self.checker._temp_call_super_check_output = self.call_super
        self.checker._temp_override_self = new_checker
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        self.install_clone()
        self.install_dt_self()

    def install_clone(self):
        # Remember the original code object, then overwrite it in place.
        self.func_code = self.check_func.func_code
        self.func_globals = self.check_func.func_globals
        self.check_func.func_code = self.clone_func.func_code

    def uninstall_clone(self):
        # Put the original code object back.
        self.check_func.func_code = self.func_code

    def install_dt_self(self):
        # Replace the (name-mangled) __record_outcome with this object so
        # that __call__ runs when the outcome is recorded.
        self.prev_func = self.dt_self._DocTestRunner__record_outcome
        self.dt_self._DocTestRunner__record_outcome = self

    def uninstall_dt_self(self):
        self.dt_self._DocTestRunner__record_outcome = self.prev_func

    def uninstall_module(self):
        # Forget the module named by del_module, both in sys.modules and
        # as an attribute of its parent package.
        if self.del_module:
            import sys
            del sys.modules[self.del_module]
            if '.' in self.del_module:
                package, module = self.del_module.rsplit('.', 1)
                package_mod = sys.modules[package]
                delattr(package_mod, module)

    def __call__(self, *args, **kw):
        # Stands in for __record_outcome: undo every patch, then forward
        # the call to the real implementation.
        self.uninstall_clone()
        self.uninstall_dt_self()
        del self.checker._temp_override_self
        del self.checker._temp_call_super_check_output
        result = self.prev_func(*args, **kw)
        self.uninstall_module()
        return result

    def call_super(self, *args, **kw):
        # Temporarily restore the original code object so the original
        # check_output logic runs, then re-install the clone.
        self.uninstall_clone()
        try:
            return self.check_func(*args, **kw)
        finally:
            self.install_clone()
440
def _find_doctest_frame():
    """
    Return the nearest calling frame that has a local named ``BOOM``.

    Such a local is taken as the sign of a doctest frame.  Raises
    LookupError when no frame on the stack has one.
    """
    import sys
    frame = sys._getframe(1)
    # Walk outwards from the caller until the marker local is found or
    # the stack is exhausted.
    while frame and 'BOOM' not in frame.f_locals:
        frame = frame.f_back
    if not frame:
        raise LookupError(
            "Could not find doctest (only use this function *inside* a doctest)")
    return frame

# Doctests picked up by doctest.testmod() through the module-level
# ``__test__`` mapping.  They exercise temp_install(), attribute order
# independence, the ``...`` wildcard and the NOPARSE_MARKUP flag.
__test__ = {
    'basic': '''
    >>> temp_install()
    >>> print """<xml a="1" b="2">stuff</xml>"""
    <xml b="2" a="1">...</xml>
    >>> print """<xml xmlns="http://example.com"><tag attr="bar" /></xml>"""
    <xml xmlns="...">
    <tag attr="..." />
    </xml>
    >>> print """<xml>blahblahblah<foo /></xml>""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS
    <xml>...foo /></xml>
    '''}

# Run the doctests above when the module is executed as a script.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
