lxml.html

1 # Copyright (c) 2004 Ian Bicking. All rights reserved. 2 # 3 # Redistribution and use in source and binary forms, with or without 4 # modification, are permitted provided that the following conditions are 5 # met: 6 # 7 # 1. Redistributions of source code must retain the above copyright 8 # notice, this list of conditions and the following disclaimer. 9 # 10 # 2. Redistributions in binary form must reproduce the above copyright 11 # notice, this list of conditions and the following disclaimer in 12 # the documentation and/or other materials provided with the 13 # distribution. 14 # 15 # 3. Neither the name of Ian Bicking nor the names of its contributors may 16 # be used to endorse or promote products derived from this software 17 # without specific prior written permission. 18 # 19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 23 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 """The ``lxml.html`` tool set for HTML handling. 
32 """ 33 34 import sys 35 import re 36 try: 37 from urlparse import urljoin 38 except ImportError: 39 # Python 3 40 from urllib.parse import urljoin 41 import copy 42 from lxml import etree 43 from lxml.html import defs 44 from lxml.html._setmixin import SetMixin 45 try: 46 from collections import MutableMapping as DictMixin 47 except ImportError: 48 # Python < 2.6 49 from UserDict import DictMixin 50 try: 51 set 52 except NameError: 53 # Python 2.3 54 from sets import Set as set 55 try: 56 bytes 57 except NameError: 58 # Python < 2.6 59 bytes = str 60 try: 61 unicode 62 except NameError: 63 # Python 3 64 unicode = str 65 try: 66 basestring 67 except NameError: 68 # Python 3 69 basestring = (str, bytes) 70

71 -def __fix_docstring(s):

72 if not s: 73 return s 74 import sys 75 if sys.version_info[0] >= 3: 76 sub = re.compile(r"^(\s*)u'", re.M).sub 77 else: 78 sub = re.compile(r"^(\s*)b'", re.M).sub 79 return sub(r"\1'", s)

# Public API of the module.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both the plain HTML tag
# and the same tag in the XHTML namespace (bound to the prefix ``x``).
_rel_links_xpath = etree.XPath(
    "descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
    namespaces={'x': XHTML_NAMESPACE})
_options_xpath = etree.XPath(
    "descendant-or-self::option|descendant-or-self::x:option",
    namespaces={'x': XHTML_NAMESPACE})
_forms_xpath = etree.XPath(
    "descendant-or-self::form|descendant-or-self::x:form",
    namespaces={'x': XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Whole-word match of $class_name inside the space-separated @class value.
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains(concat(' ', "
    "normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")

# CSS ``url(...)`` references.  Group 1 is the content between the
# parentheses: a double-quoted string, a single-quoted string, or bare text.
# The literal parentheses must be escaped, otherwise the pattern cannot match.
_iter_css_urls = re.compile(
    r'url\((' + '["][^"]*["]|' + "['][^']*[']|" + r'[^)]*)\)',
    re.I).finditer
# CSS ``@import "..."`` statements; group 1 is the quoted target.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
_label_xpath = etree.XPath(
    "//label[@for=$id]|//x:label[@for=$id]",
    namespaces={'x': XHTML_NAMESPACE})
# One space-separated entry of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')
# Extracts the ``url`` part of a <meta http-equiv="refresh"> content value
# such as ``5; url=http://example.com/``.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search

108 -def _unquote_match(s, pos):

109 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 110 return s[1:-1], pos+1 111 else: 112 return s,pos

113

114 -def _transform_result(typ, result):

115 """Convert the result back into the input type. 116 """ 117 if issubclass(typ, bytes): 118 return tostring(result, encoding='utf-8') 119 elif issubclass(typ, unicode): 120 return tostring(result, encoding='unicode') 121 else: 122 return result

123

def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace.

    String tags of the form ``{<XHTML namespace>...}name`` are reduced to
    the part after the last ``}``.  Any other value (including non-string
    tags such as the factory functions used for comments) is returned
    unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag

129

class HtmlMixin(object):
    """Mixin with the HTML-specific API shared by the lxml.html node classes.

    It expects to be combined with an lxml element base class that provides
    ``xpath()``, ``iter()``, ``get()``/``set()``, ``attrib``,
    ``getparent()``, ``getprevious()`` and ``getroottree()``.
    """

    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body',
                          namespaces={'x': XHTML_NAMESPACE})[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head',
                          namespaces={'x': XHTML_NAMESPACE})[0]
    head = property(head, doc=head.__doc__)

    def _label__get(self):
        """
        Get or set any <label> element associated with this element.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    def _label__set(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    def _label__del(self):
        label = self.label
        if label is not None:
            del label.attrib['for']
    label = property(_label__get, _label__set, _label__del,
                     doc=_label__get.__doc__)

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows the last child once the tag is gone.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element by its children in the parent.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of
        elements.  The comparison is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to
        ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.

        Raises TypeError if no base URL is available and ValueError for
        an unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            # Apply the same failure policy while resolving <base href>.
            self.resolve_base_href(handle_failures=handle_failures)

        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        # <base href> was handled above if requested.  Do not let
        # rewrite_links() resolve it again, otherwise
        # ``resolve_base_href=False`` would not be honoured.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # All <base> tags are removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but
                    # we might as well be permissive and return the entire
                    # string.
                    if url:
                        url, pos = _unquote_match(
                            url,
                            match.start('url') if match
                            else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be
                    ## the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into
                    # order and reverse the list to report correct
                    # positions despite modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place
                    # modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(
                            match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # Link inside element text (e.g. a <style> body): splice
                # the replacement in at the reported position.
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
504 505

class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # ``name`` is the method to look up on the parsed/copied document;
        # ``copy`` is the default for the per-call ``copy`` keyword.
        self.name = name
        self.copy = copy
        # Expose the wrapped method's documentation on the function object.
        wrapped = getattr(source_class, self.name)
        self.__doc__ = wrapped.__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            # String input: parse it first (keywords are passed along).
            doc = fromstring(doc, **kw)
        else:
            # Element input: optionally work on a deep copy.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # In-place method: hand back the document in the caller's input type.
        return _transform_result(result_type, doc)

# Module-level function forms of the HtmlMixin link/search methods.

# Read-only helpers: operate directly on the element that is passed in.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
iterlinks = _MethodFunc('iterlinks', copy=False)

# Modifying helpers: work on a deep copy of an element input by default.
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also carries the HtmlMixin API."""

550

class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also carries the HtmlMixin API."""

553

class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction node class that also carries the HtmlMixin API."""

556

class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also carries the HtmlMixin API."""

559 560

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.  Note that a ``classes`` mapping
    passed in by the caller is stored as-is and is updated in place when
    mixins are applied.
    """
    # Tag name -> Element class used when no ``classes`` argument is given.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # The documented interface is a mapping; the historic one is an
            # iterable of pairs.  Support both.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # Applies to every tag that has a dedicated class.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class for a node, or None to fall back to the
        normal lookup."""
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

601 602 ################################################################################ 603 # parsing 604 ################################################################################ 605 606 _looks_like_full_html_unicode = re.compile( 607 unicode(r'^\s*<(?:html|!doctype)'), re.I).match 608 _looks_like_full_html_bytes = re.compile( 609 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match 610

def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a complete document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed to ``etree.fromstring()``.  If
    ``ensure_head_body`` is true, a missing ``head`` element is inserted
    as first child and a missing ``body`` element is appended.

    Raises ``etree.ParserError`` if parsing yields no root element.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root

623

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    base_url will set the document's base_url attribute (and the tree's
    docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Wrap anything that does not look like a full document in an
    # artificial <html><body>, keeping the input's string type.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            opening = '<html><body>'.encode('ascii')
            closing = '</body></html>'.encode('ascii')
            html = opening + html + closing
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading_text = body.text
    has_leading_text = bool(leading_text and leading_text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % leading_text)
    elements = []
    if has_leading_text:
        elements.append(leading_text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements

662

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    wrap_result = bool(create_parent)

    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not wrap_result,
        base_url=base_url, **kw)

    if wrap_result:
        # A true non-string value means "use the default wrapper tag".
        if isinstance(create_parent, basestring):
            parent_tag = create_parent
        else:
            parent_tag = 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], basestring):
                # Leading text becomes the wrapper's own text.
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % names)
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    el.tail = None
    return el

710

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's
    docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = None
    if bodies:
        body = bodies[0]
        # Somehow there may be multiple bodies, which is bad, but just
        # smash them into one body
        for extra_body in bodies[1:]:
            if extra_body.text:
                if len(body):
                    last_child = body[-1]
                    last_child.tail = (last_child.tail or '') + extra_body.text
                else:
                    body.text = (body.text or '') + extra_body.text
            body.extend(extra_body)
            # We'll ignore tail
            # I guess we are ignoring attributes too
            extra_body.drop_tree()

    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for extra_head in heads[1:]:
            head.extend(extra_head)
            # We don't care about text or tail in a head
            extra_head.drop_tree()
        return doc

    if body is None:
        return doc

    def _is_blank(text):
        return not text or not text.strip()

    if len(body) == 1 and _is_blank(body.text) and _is_blank(body[-1].tail):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

777

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    # Fall back to the module's default HTML parser.
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)

790

def _contains_block_level_tag(el):
    """Return True if ``el`` or any element below it has a tag listed in
    ``defs.block_tags`` (XHTML namespace ignored)."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(_nons(node.tag) in defs.block_tags
               for node in el.iter(etree.Element))

798

def _element_name(el):
    """Return a short name for ``el`` for use in error messages:
    ``'comment'``, ``'string'`` or the element's tag without XHTML
    namespace."""
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

806 807 ################################################################################ 808 # form handling 809 ################################################################################ 810

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.
        """
        return FieldsDict(self.inputs)

    def _fields__set(self, value):
        fields = self.fields
        # Work on a private list: ``keys()`` may return a view or a list
        # owned by someone else, and entries are removed below.
        prev_keys = list(fields.keys())
        for key, field_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = field_value
        # Every field that was not mentioned gets reset.
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """Return a label for this form: its ``name``, else ``#`` plus its
        ``id``, else its index among the forms of the document body."""
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        forms = list(self.body.iter('form'))
        if not forms:
            forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    # One (name, value) pair per selected option.
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL, the returned action is joined
        with it.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        else:
            return action

    def _action__set(self, value):
        self.set('action', value)

    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(_action__get, _action__set, _action__del,
                      doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns a capitalized
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)


# Use FormElement for <form> tags in the default class lookup.
HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the values taken from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (``'GET'`` unless the form says
    otherwise), the URL is the target URL as a string, and the values are
    a sequence of ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    # A form without an action submits to the document's own URL.
    url = form.action or form.base_url
    return open_http(form.method, url, values)

954

def open_http_urllib(method, url, values):
    """
    Default HTTP opener used by ``submit_form()``, built on urllib.

    For ``method == 'GET'`` the url-encoded *values* are appended to
    *url* as a query string (after ``?``, or ``&`` when the URL already
    contains a ``?``) and no request body is sent.  For any other method
    the url-encoded values are passed to ``urlopen()`` as the request
    body.

    Returns the object returned by ``urlopen()``.
    Raises ``ValueError`` if *url* is empty or None.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # Python 3: urlencode() returns text, but urlopen() only
            # accepts bytes as request body.  The encoded form data is
            # percent-escaped, hence plain ASCII.
            data = data.encode('ASCII')
    return urlopen(url, data)

974

class FieldsDict(DictMixin):
    """
    Dictionary-like view of the field *values* of a form.

    Wraps an inputs accessor (``inputs``): reading ``fields[name]``
    returns ``inputs[name].value`` and assigning to it sets that
    ``.value``.  Keys cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the keys rather than calling len() on the accessor: the
        # accessor is not required to implement __len__, and the length
        # of a mapping has to agree with keys() / __iter__.
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())

999

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.  ``len()``
    likewise counts the individual elements, not the distinct names.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        """
        Return the field called *name*: a `RadioGroup` / `CheckboxGroup`
        when several radios / checkboxes share the name, otherwise the
        first matching element.  Raises ``KeyError`` if nothing matches.
        """
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        # The type of the first match decides how same-named elements
        # are grouped.
        input_type = results[0].get('type')
        if input_type == 'radio' and len(results) > 1:
            group = RadioGroup(results)
            group.name = name
            return group
        elif input_type == 'checkbox' and len(results) > 1:
            group = CheckboxGroup(results)
            group.name = name
            return group
        else:
            # I don't like throwing away elements like this
            return results[0]

    def __contains__(self, name):
        results = self._name_xpath(self.form, name=name)
        return bool(results)

    def keys(self):
        """
        Return the distinct field names, in document order.  Elements
        without a name are ignored.
        """
        names = []
        seen = set()
        for el in self:
            name = el.name
            if name is not None and name not in seen:
                seen.add(name)
                names.append(name)
        return names

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))

    def __len__(self):
        # Number of input elements, i.e. what iteration yields.
        return len(self._all_xpath(self.form))

1065

class InputMixin(object):

    """
    Behaviour shared by all form controls (input, select, and textarea):
    a ``name`` property backed by the ``name`` attribute, and a repr
    that shows the control's name and, when it has one, its type.
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        attributes = self.attrib
        if 'name' not in attributes:
            return
        del attributes['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Only some controls expose a ``type``; leave it out when absent
        # or empty.
        kind = getattr(self, 'type', None)
        suffix = (' type=%r' % kind) if kind else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)

1093

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    content can be read and replaced through ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        pieces = [self.text or '']
        # Child elements are serialised as XML when the textarea lives in
        # the XHTML namespace, as HTML otherwise.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        for child in self:
            pieces.append(etree.tostring(
                child, method=serialisation_method, encoding='unicode'))
        return ''.join(pieces)

    def _value__set(self, value):
        # Drop all child elements, then install the new text.
        del self[:]
        self.text = value

    def _value__del(self):
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` is the value of the selected option, except for a
    multi-select element (``<select multiple>``), where it is a
    set-like object.  ``.value_options`` lists the possible values in
    both cases.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is None:
                continue
            # An option without a ``value`` attribute falls back to its
            # text content.
            selected = option.get('value')
            if selected is None:
                selected = option.text or ''
            if selected:
                selected = selected.strip()
            return selected
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        checked_option = None
        if value is not None:
            value = value.strip()
            # Locate the option to select before touching any state, so
            # that an unknown value leaves the selection unchanged.
            for option in _options_xpath(self):
                opt_value = option.get('value')
                if opt_value is None:
                    opt_value = option.text or ''
                if opt_value:
                    opt_value = opt_value.strip()
                if opt_value == value:
                    checked_option = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if checked_option is not None:
            checked_option.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the option text
        where that attribute is missing).
        """
        options = []
        for option in _options_xpath(self):
            candidate = option.get('value')
            if candidate is None:
                candidate = option.text or ''
            if candidate:
                candidate = candidate.strip()
            options.append(candidate)
        return options

    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)


HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))
    options = property(options)

    def _option_value(option):
        # Value an option stands for: its ``value`` attribute, or its
        # text when the attribute is missing; non-empty values are
        # stripped of surrounding whitespace.
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = option.text or ''
        if opt_value:
            opt_value = opt_value.strip()
        return opt_value
    _option_value = staticmethod(_option_value)

    def __iter__(self):
        """Yield the value of every currently selected option."""
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value equals *item*.

        Raises ``ValueError`` if there is no such option.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals *item*.

        Raises ``ValueError`` if there is no such option, or if that
        option is not currently selected.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1288

class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements sharing one name.

    Besides behaving like a list, the ``.value`` property checks and
    unchecks the inputs, and ``.value_options`` lists the possible
    values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def _value__set(self, value):
        checked_option = None
        if value is not None:
            # Find the target first so that an unknown value raises
            # before any radio gets unchecked.
            for radio in self:
                if radio.get('value') == value:
                    checked_option = radio
                    break
            else:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if checked_option is not None:
            checked_option.set('checked', '')

    def _value__del(self):
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        values = []
        for radio in self:
            values.append(radio.get('value'))
        return values

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__,
            list.__repr__(self))

1340

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before clearing, so that a rejected value leaves the
        # current check state untouched.  Strings are rejected
        # explicitly: on Python 3 they have ``__iter__`` and would
        # otherwise be consumed character by character.
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def _value__del(self):
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1379

class CheckboxValues(SetMixin):

    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        # Check the first checkbox carrying this value.
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        # Uncheck the first checkbox carrying this value; it is an error
        # if that checkbox is not checked.
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)

1422

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type (lower-cased, ``'text'`` when the
    attribute is missing).

    ``.value`` gets and sets the value.

    Checkboxes and radios have ``input.checkable == True`` (it is false
    for all other types) and a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truth of *value* drives the checked
        # state; only a string value is also stored as the attribute.
        should_check = bool(value)
        self.checked = should_check
        if should_check and isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)


HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Returns None when
        the label has no (non-empty) ``for`` attribute; otherwise the id
        is looked up with ``self.body.get_element_by_id()``.
        """
        # NOTE(review): what happens for an id that is not in the
        # document depends on get_element_by_id() - confirm.
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    def _for_element__set(self, other):
        # The link is stored as the target's id, so the target needs one.
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link lives in the label's ``for`` attribute (see getter and
        # setter), so that is what has to go - not the label's own ``id``.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)


HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    Accepts an element or anything with a ``getroot()`` method; the tree
    is modified in place and tags that already carry a namespace are
    left alone.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # already an element, nothing to unwrap
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for element in html.iter(etree.Element):
        name = element.tag
        if name[0] == '{':
            continue
        element.tag = xhtml_prefix + name

1557

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    Accepts an element or anything with a ``getroot()`` method; the tree
    is modified in place.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # already an element, nothing to unwrap
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    strip_from = len(xhtml_prefix)
    # iterate only over elements that live in the XHTML namespace
    for element in xhtml.iter(xhtml_prefix + "*"):
        qualified = element.tag
        element.tag = qualified[strip_from:]

# Pre-bound ``sub`` methods of two compiled patterns that match a
# ``<meta http-equiv="Content-Type" ...>`` tag: one for text, one for
# ASCII-encoded bytes.
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: when serialising with ``method='html'``, any
    ``<meta http-equiv="Content-Type" ...>`` tag is stripped from the
    result unless ``include_meta_content_type`` is true.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    if method != 'html' or include_meta_content_type:
        return serialised
    # Strip the Content-Type meta tag with the pattern that matches the
    # type of the serialised result (text vs. bytes).
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)

# Post-process the docstring of tostring() with __fix_docstring (defined
# elsewhere in this module).
# NOTE(review): presumably this adapts the string prefixes in the doctest
# output to the running Python version - confirm against __fix_docstring.
tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or an ElementTree.  The file is written
    with ``encoding`` if given, else with the document's own encoding,
    else as UTF-8.  The ``file:`` URL of the temporary file is printed
    before the browser is launched.
    """
    import os
    import webbrowser
    import tempfile
    try:
        from urllib.request import pathname2url
    except ImportError:  # Python 2
        from urllib import pathname2url
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    f = os.fdopen(handle, 'wb')
    try:
        doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
    finally:
        # we leak the file itself here, but we should at least close it
        f.close()
    # Build the URL with the stdlib helpers instead of by string
    # concatenation: they percent-encode special characters and produce a
    # well-formed ``file:///C:/...`` URL for Windows drive paths.
    url = urljoin('file:', pathname2url(fn))
    print(url)
    webbrowser.open(url)

1674 1675 ################################################################################ 1676 # configure Element class lookup 1677 ################################################################################ 1678

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.
    """

    def __init__(self, **kwargs):
        # All keyword arguments go to etree.HTMLParser unchanged; the
        # element class lookup is installed afterwards.
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1686

class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """

    def __init__(self, **kwargs):
        # All keyword arguments go to etree.XMLParser unchanged; the
        # element class lookup is installed afterwards.
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1707

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.

    All arguments are passed on to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instances, created once at import time with default
# settings; Element() builds its elements through ``html_parser``.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Source Code for Package lxml.html