Package lxml :: Package isoschematron
[hide private]
[frames | no frames]

Source Code for Package lxml.isoschematron

  1  """The ``lxml.isoschematron`` package implements ISO Schematron support on top 
  2  of the pure-xslt 'skeleton' implementation. 
  3  """ 
  4   
  5  import sys 
  6  import os.path 
  7  from lxml import etree as _etree # due to validator __init__ signature 
  8   
  9   
# Python 2/3 compatibility shims, borrowed from lxml.html.
# Each name is looked up first; only if that raises NameError (i.e. the
# builtin does not exist, as on Python 3) is it bound to ``str``.
try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str
 21   
 22   
# Public API of this package.
__all__ = ['extract_xsd', 'extract_rng', 'iso_dsdl_include',
           'iso_abstract_expand', 'iso_svrl_for_xslt1',
           'svrl_validation_errors', 'schematron_schema_valid',
           'stylesheet_params', 'Schematron']


# Namespace URIs used below to recognise schema documents and to query
# SVRL validation reports.
#FIXME: Maybe lxml should provide a dedicated place for common namespace
#FIXME: definitions?
XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema"
RELAXNG_NS = "http://relaxng.org/ns/structure/1.0"
SCHEMATRON_NS = "http://purl.oclc.org/dsdl/schematron"
SVRL_NS = "http://purl.oclc.org/dsdl/svrl"


# Helpers: root element tags in '{namespace}localname' form, and the
# directory (next to this file) holding the bundled xsl/rng resources.
_schematron_root = '{%s}schema' % SCHEMATRON_NS
_xml_schema_root = '{%s}schema' % XML_SCHEMA_NS
_resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
 42   
 43   
# The iso-schematron skeleton implementation steps aka xsl transformations.
# Each stylesheet is parsed from the resources directory and compiled into an
# XSLT object when this module is imported.
extract_xsd = _etree.XSLT(_etree.parse(
    os.path.join(_resources_dir, 'xsl', 'XSD2Schtrn.xsl')))
extract_rng = _etree.XSLT(_etree.parse(
    os.path.join(_resources_dir, 'xsl', 'RNG2Schtrn.xsl')))
iso_dsdl_include = _etree.XSLT(_etree.parse(
    os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
                 'iso_dsdl_include.xsl')))
iso_abstract_expand = _etree.XSLT(_etree.parse(
    os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
                 'iso_abstract_expand.xsl')))
iso_svrl_for_xslt1 = _etree.XSLT(_etree.parse(
    os.path.join(_resources_dir,
                 'xsl', 'iso-schematron-xslt1', 'iso_svrl_for_xslt1.xsl')))


# svrl result accessors: selects every svrl:failed-assert element of a report
svrl_validation_errors = _etree.XPath(
    '//svrl:failed-assert', namespaces={'svrl': SVRL_NS})


# RelaxNG validator for schematron schemas, built from the bundled grammar
schematron_schema_valid = _etree.RelaxNG(
    file=os.path.join(_resources_dir, 'rng', 'iso-schematron.rng'))
def stylesheet_params(**kwargs):
    """Build a dictionary of XSLT stylesheet parameters from keyword args.

    XSL stylesheet parameters must be XPath expressions, i.e.:

    * string expressions, like "'5'"
    * simple (number) expressions, like "5"
    * valid XPath expressions, like "/a/b/text()"

    Every keyword argument is mapped to a stylesheet parameter value:

    * a string is wrapped with ``XSLT.strparam()``,
    * ``None`` is rejected with a ``TypeError``,
    * an ``XPath`` object is passed through unchanged,
    * anything else is converted to a (unicode) string.

    Returns a new dict keyed by the keyword argument names.
    """
    def _as_param(value):
        # Convert one Python value into its stylesheet parameter form.
        if isinstance(value, basestring):
            return _etree.XSLT.strparam(value)
        if value is None:
            raise TypeError('None not allowed as a stylesheet parameter')
        if isinstance(value, _etree.XPath):
            return value
        return unicode(value)

    return dict((name, _as_param(value)) for name, value in kwargs.items())
95
# Helper for Schematron.__init__: merges explicitly exposed keyword arguments
# into a user-supplied parameter dictionary.
def _stylesheet_param_dict(paramsDict, kwargsDict):
    """Merge kwargsDict into a copy of paramsDict and wrap the result as
    stylesheet arguments.

    Entries of kwargsDict whose value is None are skipped, so they never
    override an entry of paramsDict. paramsDict itself is left untouched.
    """
    # Work on a copy: the caller may have handed in a shared (default) dict.
    merged = dict(paramsDict)
    merged.update(
        (name, value) for name, value in kwargsDict.items()
        if value is not None)
    return stylesheet_params(**merged)
110
class Schematron(_etree._Validator):
    """An ISO Schematron validator.

    Pass a root Element or an ElementTree to turn it into a validator.
    Alternatively, pass a filename as keyword argument 'file' to parse from
    the file system.

    Schematron is a less well known, but very powerful schema language.
    The main idea is to use the capabilities of XPath to put restrictions on
    the structure and the content of XML documents.

    The standard behaviour is to fail on ``failed-assert`` findings only
    (``ASSERTS_ONLY``).  To change this, you can either pass a report filter
    function to the ``error_finder`` parameter (e.g. ``ASSERTS_AND_REPORTS``
    or a custom ``XPath`` object), or subclass isoschematron.Schematron for
    complete control of the validation process.

    Built on the Schematron language 'reference' skeleton pure-xslt
    implementation, the validator is created as an XSLT 1.0 stylesheet using
    these steps:

     0) (Extract from XML Schema or RelaxNG schema)
     1) Process inclusions
     2) Process abstract patterns
     3) Compile the schematron schema to XSLT

    The ``include`` and ``expand`` keyword arguments can be used to switch off
    steps 1) and 2).
    To set parameters for steps 1), 2) and 3) hand parameter dictionaries to the
    keyword arguments ``include_params``, ``expand_params`` or
    ``compile_params``.
    For convenience, the compile-step parameter ``phase`` is also exposed as a
    keyword argument ``phase``. This takes precedence if the parameter is also
    given in the parameter dictionary.

    If ``store_schematron`` is set to True, the (included-and-expanded)
    schematron document tree is stored and available through the ``schematron``
    property.
    If ``store_xslt`` is set to True, the validation XSLT document tree will be
    stored and can be retrieved through the ``validator_xslt`` property.
    With ``store_report`` set to True (default: False), the resulting validation
    report document gets stored and can be accessed as the ``validation_report``
    property.

    Here is a usage example::

      >>> from lxml import etree
      >>> from lxml.isoschematron import Schematron

      >>> schematron = Schematron(etree.XML('''
      ... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
      ...   <pattern id="id_only_attribute">
      ...     <title>id is the only permitted attribute name</title>
      ...     <rule context="*">
      ...       <report test="@*[not(name()='id')]">Attribute
      ...         <name path="@*[not(name()='id')]"/> is forbidden<name/>
      ...       </report>
      ...     </rule>
      ...   </pattern>
      ... </schema>'''),
      ... error_finder=Schematron.ASSERTS_AND_REPORTS)

      >>> xml = etree.XML('''
      ... <AAA name="aaa">
      ...   <BBB id="bbb"/>
      ...   <CCC color="ccc"/>
      ... </AAA>
      ... ''')

      >>> schematron.validate(xml)
      False

      >>> xml = etree.XML('''
      ... <AAA id="aaa">
      ...   <BBB id="bbb"/>
      ...   <CCC/>
      ... </AAA>
      ... ''')

      >>> schematron.validate(xml)
      True
    """

    # libxml2 error categorization for validation errors
    _domain = _etree.ErrorDomains.SCHEMATRONV
    _level = _etree.ErrorLevels.ERROR
    _error_type = _etree.ErrorTypes.SCHEMATRONV_ASSERT

    # convenience definitions for common behaviours
    ASSERTS_ONLY = svrl_validation_errors  # Default
    ASSERTS_AND_REPORTS = _etree.XPath(
        '//svrl:failed-assert | //svrl:successful-report',
        namespaces={'svrl': SVRL_NS})

    def _extract(self, element):
        """Extract embedded schematron schema from non-schematron host schema.

        This method will only be called by __init__ if the given schema
        document is not a schematron schema by itself.
        Must return a schematron schema document tree or None.

        None is returned when the root element is neither an XML Schema
        ``schema`` element nor bound to the RelaxNG namespace; this includes
        root elements that are in no namespace at all.
        """
        schematron = None
        if element.tag == _xml_schema_root:
            schematron = self._extract_xsd(element)
        # Use .get(): for an element without namespace, ``prefix`` is None
        # and need not be a key of ``nsmap``; indexing would raise KeyError
        # instead of letting __init__ report a non-schematron document.
        elif element.nsmap.get(element.prefix) == RELAXNG_NS:
            # RelaxNG does not have a single unique root element
            schematron = self._extract_rng(element)
        return schematron

    # customization points
    # etree.XSLT objects that provide the extract, include, expand, compile
    # steps
    _extract_xsd = extract_xsd
    _extract_rng = extract_rng
    _include = iso_dsdl_include
    _expand = iso_abstract_expand
    _compile = iso_svrl_for_xslt1

    # etree.xpath object that determines input document validity when applied
    # to the svrl result report; must return a list of result elements (empty
    # if valid)
    _validation_errors = ASSERTS_ONLY

    # NOTE: the ``{}`` defaults below are shared between calls but are never
    # mutated here: they are only unpacked with ``**`` or copied by
    # _stylesheet_param_dict().
    def __init__(self, etree=None, file=None, include=True, expand=True,
                 include_params={}, expand_params={}, compile_params={},
                 store_schematron=False, store_xslt=False, store_report=False,
                 phase=None, error_finder=ASSERTS_ONLY):
        """Build the validating XSLT from a schema tree or file.

        Raises ``SchematronParseError`` if obtaining the root element fails,
        if the document is neither a schematron schema nor
        schematron-extractable, or if the (included/expanded) schema does not
        validate against the schematron RelaxNG grammar.
        Raises ``ValueError`` if no root element could be obtained.
        """
        super(Schematron, self).__init__()

        self._store_report = store_report
        self._schematron = None
        self._validator_xslt = None
        self._validation_report = None
        # only shadow the class-level finder when a non-default one is given
        if error_finder is not self.ASSERTS_ONLY:
            self._validation_errors = error_finder

        # parse schema document, may be a schematron schema or an XML Schema or
        # a RelaxNG schema with embedded schematron rules
        root = None
        try:
            if etree is not None:
                if _etree.iselement(etree):
                    root = etree
                else:
                    root = etree.getroot()
            elif file is not None:
                root = _etree.parse(file).getroot()
        except Exception:
            raise _etree.SchematronParseError(
                "No tree or file given: %s" % sys.exc_info()[1])
        if root is None:
            raise ValueError("Empty tree")
        if root.tag == _schematron_root:
            schematron = root
        else:
            schematron = self._extract(root)
        if schematron is None:
            raise _etree.SchematronParseError(
                "Document is not a schematron schema or schematron-extractable")
        # perform the iso-schematron skeleton implementation steps to get a
        # validating xslt
        if include:
            schematron = self._include(schematron, **include_params)
        if expand:
            schematron = self._expand(schematron, **expand_params)
        if not schematron_schema_valid(schematron):
            raise _etree.SchematronParseError(
                "invalid schematron schema: %s" %
                schematron_schema_valid.error_log)
        if store_schematron:
            self._schematron = schematron
        # add new compile keyword args here if exposing them
        compile_kwargs = {'phase': phase}
        compile_params = _stylesheet_param_dict(compile_params, compile_kwargs)
        validator_xslt = self._compile(schematron, **compile_params)
        if store_xslt:
            self._validator_xslt = validator_xslt
        self._validator = _etree.XSLT(validator_xslt)

    def __call__(self, etree):
        """Validate doc using Schematron.

        Returns true if document is valid, false if not.
        Every element selected by the error finder is appended to the error
        log as a serialized string. If ``store_report`` was set, the report
        produced by the validating XSLT is kept for ``validation_report``.
        """
        self._clear_error_log()
        result = self._validator(etree)
        if self._store_report:
            self._validation_report = result
        errors = self._validation_errors(result)
        if errors:
            # accept both an Element and an ElementTree as input document
            if _etree.iselement(etree):
                fname = etree.getroottree().docinfo.URL or '<file>'
            else:
                fname = etree.docinfo.URL or '<file>'
            for error in errors:
                # Does svrl report the line number, anywhere? Don't think so.
                self._append_log_message(
                    domain=self._domain, type=self._error_type,
                    level=self._level, line=0,
                    message=_etree.tostring(error, encoding='unicode'),
                    filename=fname)
            return False
        return True

    @property
    def schematron(self):
        """ISO-schematron schema document (None if object has been initialized
        with store_schematron=False).
        """
        return self._schematron

    @property
    def validator_xslt(self):
        """ISO-schematron skeleton implementation XSLT validator document (None
        if object has been initialized with store_xslt=False).
        """
        return self._validator_xslt

    @property
    def validation_report(self):
        """ISO-schematron validation result report (None if result-storing has
        been turned off).
        """
        return self._validation_report
335