Package lxml :: Package isoschematron
[hide private]
[frames | no frames]

Source Code for Package lxml.isoschematron

  1  """The ``lxml.isoschematron`` package implements ISO Schematron support on top 
  2  of the pure-xslt 'skeleton' implementation. 
  3  """ 
  4   
  5  import sys 
  6  import os.path 
  7  from lxml import etree as _etree # due to validator __init__ signature 
  8   
  9   
 10  # some compat stuff, borrowed from lxml.html 
 11  try: 
 12      bytes = __builtins__["bytes"] 
 13  except (KeyError, NameError): 
 14      # Python < 2.6 
 15      bytes = str 
 16  try: 
 17      unicode = __builtins__["unicode"] 
 18  except (KeyError, NameError): 
 19      # Python 3 
 20      unicode = str 
 21  try: 
 22      basestring = __builtins__["basestring"] 
 23  except (KeyError, NameError): 
 24      # Python 3 
 25      basestring = str 
 26   
 27   
 28  __all__ = ['extract_xsd', 'extract_rng', 'iso_dsdl_include', 
 29             'iso_abstract_expand', 'iso_svrl_for_xslt1', 
 30             'svrl_validation_errors', 'schematron_schema_valid', 
 31             'stylesheet_params', 'Schematron']  
 32   
 33   
 34  # some namespaces 
 35  #FIXME: Maybe lxml should provide a dedicated place for common namespace 
 36  #FIXME: definitions? 
 37  XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema" 
 38  RELAXNG_NS = "http://relaxng.org/ns/structure/1.0" 
 39  SCHEMATRON_NS = "http://purl.oclc.org/dsdl/schematron" 
 40  SVRL_NS = "http://purl.oclc.org/dsdl/svrl" 
 41   
 42   
 43  # some helpers 
 44  _schematron_root = '{%s}schema' % SCHEMATRON_NS 
 45  _xml_schema_root = '{%s}schema' % XML_SCHEMA_NS 
 46  _resources_dir = os.path.join(os.path.dirname(__file__), 'resources') 
 47   
 48   
 49  # the iso-schematron skeleton implementation steps aka xsl transformations 
 50  extract_xsd = _etree.XSLT(_etree.parse( 
 51      os.path.join(_resources_dir, 'xsl', 'XSD2Schtrn.xsl'))) 
 52  extract_rng = _etree.XSLT(_etree.parse( 
 53      os.path.join(_resources_dir, 'xsl', 'RNG2Schtrn.xsl'))) 
 54  iso_dsdl_include = _etree.XSLT(_etree.parse( 
 55      os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1', 
 56                   'iso_dsdl_include.xsl'))) 
 57  iso_abstract_expand = _etree.XSLT(_etree.parse( 
 58      os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1', 
 59                   'iso_abstract_expand.xsl'))) 
 60  iso_svrl_for_xslt1 = _etree.XSLT(_etree.parse( 
 61      os.path.join(_resources_dir, 
 62                   'xsl', 'iso-schematron-xslt1', 'iso_svrl_for_xslt1.xsl'))) 
 63   
 64   
 65  # svrl result accessors 
 66  svrl_validation_errors = _etree.XPath( 
 67      '//svrl:failed-assert', namespaces={'svrl': SVRL_NS}) 
 68   
 69   
 70  # RelaxNG validator for schematron schemas 
 71  schematron_schema_valid = _etree.RelaxNG(_etree.parse( 
 72      os.path.join(_resources_dir, 'rng', 'iso-schematron.rng'))) 
 73   
 74   
def stylesheet_params(**kwargs):
    """Turn keyword arguments into a dictionary of stylesheet parameters.

    XSL stylesheet parameters must be XPath expressions, i.e.:

     * string expressions, like "'5'"
     * simple (number) expressions, like "5"
     * valid XPath expressions, like "/a/b/text()"

    Each keyword argument is converted according to its value:

     * a string is wrapped with ``XSLT.strparam()``
     * ``None`` is rejected with a ``TypeError``
     * an ``XPath`` object is passed through unchanged
     * anything else is converted to its unicode string form
    """
    params = {}
    for name in kwargs:
        value = kwargs[name]
        if isinstance(value, basestring):
            # plain strings must be quoted to become XPath string literals
            params[name] = _etree.XSLT.strparam(value)
            continue
        if value is None:
            raise TypeError('None not allowed as a stylesheet parameter')
        if isinstance(value, _etree.XPath):
            params[name] = value
        else:
            params[name] = unicode(value)
    return params
98 99 100 # helper function for use in Schematron __init__
def _stylesheet_param_dict(paramsDict, kwargsDict):
    """Merge ``kwargsDict`` into a copy of ``paramsDict`` and return the
    result converted to stylesheet arguments by ``stylesheet_params()``.

    Entries of ``kwargsDict`` whose value is None do not override anything.
    """
    # work on a copy: paramsDict may be a (mutable) default argument
    merged = dict(paramsDict)
    merged.update(
        (name, value) for name, value in kwargsDict.items()
        if value is not None)
    return stylesheet_params(**merged)
113 114
class Schematron(_etree._Validator):
    """An ISO Schematron validator.

    Pass a root Element or an ElementTree to turn it into a validator.
    Alternatively, pass a filename as keyword argument 'file' to parse from
    the file system.
    Built on the Schematron language 'reference' skeleton pure-xslt
    implementation, the validator is created as an XSLT 1.0 stylesheet using
    these steps:

     0) (Extract from XML Schema or RelaxNG schema)
     1) Process inclusions
     2) Process abstract patterns
     3) Compile the schematron schema to XSLT

    The ``include`` and ``expand`` keyword arguments can be used to switch off
    steps 1) and 2).
    To set parameters for steps 1), 2) and 3) hand parameter dictionaries to the
    keyword arguments ``include_params``, ``expand_params`` or
    ``compile_params``.
    For convenience, the compile-step parameter ``phase`` is also exposed as a
    keyword argument ``phase``. This takes precedence if the parameter is also
    given in the parameter dictionary.
    If ``store_schematron`` is set to True, the (included-and-expanded)
    schematron document tree is stored and available through the ``schematron``
    property.
    If ``store_xslt`` is set to True, the validation XSLT document tree will be
    stored and can be retrieved through the ``validator_xslt`` property.
    With ``store_report`` set to True (default: False), the resulting validation
    report document gets stored and can be accessed as the ``validation_report``
    property.

    Schematron is a less well known, but very powerful schema language. The main
    idea is to use the capabilities of XPath to put restrictions on the structure
    and the content of XML documents. Here is a simple example::

      >>> from lxml import isoschematron
      >>> schematron = isoschematron.Schematron(etree.XML('''
      ... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
      ...   <pattern id="id_only_attribute">
      ...     <title>id is the only permitted attribute name</title>
      ...     <rule context="*">
      ...       <report test="@*[not(name()='id')]">Attribute
      ...         <name path="@*[not(name()='id')]"/> is forbidden<name/>
      ...       </report>
      ...     </rule>
      ...   </pattern>
      ... </schema>
      ... '''))

      >>> xml = etree.XML('''
      ... <AAA name="aaa">
      ...   <BBB id="bbb"/>
      ...   <CCC color="ccc"/>
      ... </AAA>
      ... ''')

      >>> schematron.validate(xml)
      0

      >>> xml = etree.XML('''
      ... <AAA id="aaa">
      ...   <BBB id="bbb"/>
      ...   <CCC/>
      ... </AAA>
      ... ''')

      >>> schematron.validate(xml)
      1
    """

    # libxml2 error categorization for validation errors
    _domain = _etree.ErrorDomains.SCHEMATRONV
    _level = _etree.ErrorLevels.ERROR
    _error_type = _etree.ErrorTypes.SCHEMATRONV_ASSERT

    def _extract(self, element):
        """Extract embedded schematron schema from non-schematron host schema.

        This method will only be called by __init__ if the given schema document
        is not a schematron schema by itself.
        Must return a schematron schema document tree or None.
        """
        if element.tag == _xml_schema_root:
            return self._extract_xsd(element)
        # RelaxNG does not have a single unique root element, so the root's
        # namespace is checked instead of its tag.  ``.get()`` is used because
        # a root element without any namespace has no nsmap entry for its
        # (None) prefix; such a document must yield None, not a KeyError.
        if element.nsmap.get(element.prefix) == RELAXNG_NS:
            return self._extract_rng(element)
        return None

    # customization points
    # etree.XSLT objects that provide the extract, include, expand, compile
    # steps
    _extract_xsd = extract_xsd
    _extract_rng = extract_rng
    _include = iso_dsdl_include
    _expand = iso_abstract_expand
    _compile = iso_svrl_for_xslt1
    # etree.XPath object that determines input document validity when applied to
    # the svrl result report; must return a list of result elements (empty if
    # valid)
    _validation_errors = svrl_validation_errors

    # NOTE: the dict defaults below are only read (unpacked with ``**`` or
    # copied by _stylesheet_param_dict), never mutated.
    def __init__(self, etree=None, file=None, include=True, expand=True,
                 include_params={}, expand_params={}, compile_params={},
                 store_schematron=False, store_xslt=False, store_report=False,
                 phase=None):
        """Build the validating XSLT from a schema tree or file.

        ``etree`` is a root Element or an object providing ``getroot()``;
        it takes precedence over ``file``, which is handed to
        ``etree.parse()``.  See the class docstring for the remaining
        keyword arguments.

        Raises ``SchematronParseError`` if obtaining the root element fails,
        if the document is neither a schematron schema nor
        schematron-extractable, or if the resulting schematron schema is
        invalid.  Raises ``ValueError`` if no root element is available,
        which includes the case that neither ``etree`` nor ``file`` is given.
        """
        super(Schematron, self).__init__()

        self._store_report = store_report
        self._schematron = None
        self._validator_xslt = None
        self._validation_report = None

        # parse schema document, may be a schematron schema or an XML Schema or
        # a RelaxNG schema with embedded schematron rules
        #
        # ``root`` is pre-bound so that calling without ``etree`` and ``file``
        # reaches the "Empty tree" check below instead of failing with an
        # UnboundLocalError.
        root = None
        try:
            if etree is not None:
                if isinstance(etree, _etree._Element):
                    root = etree
                else:
                    root = etree.getroot()
            elif file is not None:
                root = _etree.parse(file).getroot()
        except Exception:
            raise _etree.SchematronParseError(
                "No tree or file given: %s" % sys.exc_info()[1])
        if root is None:
            raise ValueError("Empty tree")
        if root.tag == _schematron_root:
            schematron = root
        else:
            schematron = self._extract(root)
        if schematron is None:
            raise _etree.SchematronParseError(
                "Document is not a schematron schema or schematron-extractable")
        # perform the iso-schematron skeleton implementation steps to get a
        # validating xslt
        if include:
            schematron = self._include(schematron, **include_params)
        if expand:
            schematron = self._expand(schematron, **expand_params)
        if not schematron_schema_valid(schematron):
            raise _etree.SchematronParseError(
                "invalid schematron schema: %s" %
                schematron_schema_valid.error_log)
        if store_schematron:
            self._schematron = schematron
        # add new compile keyword args here if exposing them
        compile_kwargs = {'phase': phase}
        compile_params = _stylesheet_param_dict(compile_params, compile_kwargs)
        validator_xslt = self._compile(schematron, **compile_params)
        if store_xslt:
            self._validator_xslt = validator_xslt
        self._validator = _etree.XSLT(validator_xslt)

    def __call__(self, etree):
        """Validate doc using Schematron.

        Returns true if document is valid, false if not.
        Every failed assertion found in the report is appended to the error
        log; the report itself is kept if ``store_report`` was set.
        """
        self._clear_error_log()
        result = self._validator(etree)
        if self._store_report:
            self._validation_report = result
        errors = self._validation_errors(result)
        if not errors:
            return True

        # an Element has to go through its tree to reach the document info
        if isinstance(etree, _etree._Element):
            fname = etree.getroottree().docinfo.URL or '<file>'
        else:
            fname = etree.docinfo.URL or '<file>'
        for error in errors:
            # Does svrl report the line number, anywhere? Don't think so.
            self._append_log_message(
                domain=self._domain, type=self._error_type,
                level=self._level, line=0, message=_etree.tounicode(error),
                filename=fname)
        return False

    @property
    def schematron(self):
        """ISO-schematron schema document (None if object has been initialized
        with store_schematron=False).
        """
        return self._schematron

    @property
    def validator_xslt(self):
        """ISO-schematron skeleton implementation XSLT validator document (None
        if object has been initialized with store_xslt=False).
        """
        return self._validator_xslt

    @property
    def validation_report(self):
        """ISO-schematron validation result report (None if result-storing has
        been turned off).
        """
        return self._validation_report
315