lxml.html

1 # Copyright (c) 2004 Ian Bicking. All rights reserved. 2 # 3 # Redistribution and use in source and binary forms, with or without 4 # modification, are permitted provided that the following conditions are 5 # met: 6 # 7 # 1. Redistributions of source code must retain the above copyright 8 # notice, this list of conditions and the following disclaimer. 9 # 10 # 2. Redistributions in binary form must reproduce the above copyright 11 # notice, this list of conditions and the following disclaimer in 12 # the documentation and/or other materials provided with the 13 # distribution. 14 # 15 # 3. Neither the name of Ian Bicking nor the names of its contributors may 16 # be used to endorse or promote products derived from this software 17 # without specific prior written permission. 18 # 19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 23 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 """The ``lxml.html`` tool set for HTML handling. 
32 """ 33 34 from __future__ import absolute_import 35 36 __all__ = [ 37 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 38 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 39 'find_rel_links', 'find_class', 'make_links_absolute', 40 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] 41 42 43 import copy 44 import sys 45 import re 46 from functools import partial 47 48 try: 49 # while unnecessary, importing from 'collections.abc' is the right way to do it 50 from collections.abc import MutableMapping, MutableSet 51 except ImportError: 52 from collections import MutableMapping, MutableSet 53 54 from .. import etree 55 from . import defs 56 from ._setmixin import SetMixin 57 58 try: 59 from urlparse import urljoin 60 except ImportError: 61 # Python 3 62 from urllib.parse import urljoin 63 64 try: 65 unicode 66 except NameError: 67 # Python 3 68 unicode = str 69 try: 70 basestring 71 except NameError: 72 # Python 3 73 basestring = (str, bytes)

def __fix_docstring(s):
    """Drop the string-literal prefix from quoted lines of a docstring.

    On Python 3 a leading ``u'`` (after optional indentation) at the start
    of a line is rewritten to ``'``; on Python 2 the same is done for
    ``b'``.  Falsy input (``None`` or ``''``) is returned unchanged.
    """
    if not s:
        return s
    running_py3 = sys.version_info[0] >= 3
    pattern = r"^(\s*)u'" if running_py3 else r"^(\s*)b'"
    return re.compile(pattern, re.M).sub(r"\1'", s)

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath queries.  Those with an ``x:`` alternative match both the
# plain HTML tag and the same tag in the XHTML namespace.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Whole-word match of $class_name inside the whitespace-normalised @class.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Finds CSS ``url(...)`` references.  Group 1 is the argument between the
# literal parentheses, *including* any surrounding quotes (callers strip them
# with _unquote_match).  The parentheses must be escaped: an unescaped ``$``
# or ``(`` here would make the pattern unable to match ``url(...)`` at all.
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Finds ``@import "..."`` statements; group 1 is the quoted target.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# One space-separated item of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')
# Extracts the URL part of a <meta http-equiv="refresh"> content value
# such as "5; url=http://example.com/".
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``.

    Returns ``(text, position)``.  When ``s`` starts and ends with the same
    quote character (``"`` or ``'``), the quotes are removed and ``pos`` is
    advanced by one to point at the first unquoted character; otherwise
    ``s`` and ``pos`` are returned unchanged.
    """
    opening, closing = s[:1], s[-1:]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos

112

def _transform_result(typ, result):
    """Convert the result back into the input type.

    If ``typ`` is a bytes type the result is serialised as UTF-8 bytes, if
    it is a unicode type it is serialised as a unicode string; for any
    other type the result is handed back untouched.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)

123

def _nons(tag):
    """Return ``tag`` without an XHTML namespace prefix.

    A string of the form ``{<XHTML namespace>}name`` is reduced to
    ``name``.  Any other string, and any non-string tag, is returned
    unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    namespace_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:namespace_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag

130

class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """

    def __init__(self, attributes):
        # 'attributes' is the mapping that holds the 'class' entry; every
        # operation re-reads it, so the wrapper never goes stale.
        self._attributes = attributes
        self._get_class_value = partial(attributes.get, 'class', '')

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError for an empty name or one containing whitespace.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.  When no class is left,
        the 'class' entry itself is deleted.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        kept = [name for name in self._get_class_value().split()
                if name != value]
        if kept:
            self._attributes['class'] = ' '.join(kept)
            return
        if 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        raw = self._get_class_value()
        # Cheap substring test first; only split when it can succeed.
        if name not in raw:
            return False
        return name in raw.split()

    def __iter__(self):
        names = self._get_class_value().split()
        return iter(names)

    def __len__(self):
        names = self._get_class_value().split()
        return len(names)

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.
        """
        names = self._get_class_value().split()
        initial_count = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        # Only write back when something was actually added.
        if len(names) != initial_count:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled

238

class HtmlMixin(object):
    """Mixin that adds the HTML-specific convenience API to element classes.

    It is combined with the etree base classes to form the concrete HTML
    node classes; the methods rely on the usual element API of ``self``
    (``get``, ``set``, ``attrib``, ``xpath``, ``iter`` ...).
    """

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the value is None,
        creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
        for ``form.set('novalidate')``.
        """
        # Deliberately starts the lookup after HtmlElement in the MRO so that
        # the base element implementation of set() is reached.
        super(HtmlElement, self).set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.

        Returns None if this element has no id or no label refers to it.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    @label.setter
    def label(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows our last child once the children move up.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element by its children in the parent's child list.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison of the ``rel`` value is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.

        Raises TypeError if no base URL is available and ValueError for an
        unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()

        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    # None tells rewrite_links() to drop the link.
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        self.rewrite_links(link_repl)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # Every <base href> tag is removed; the href of the last one is used.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.  When several links live in
        the same attribute (e.g. multiple ``url()`` references in one
        ``style`` attribute) and one of them is removed this way, the
        attribute stays removed and its remaining links are skipped.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                elif attrib in el.attrib:
                    # The attribute may already be gone if an earlier link
                    # of the same attribute was removed in this pass.
                    del el.attrib[attrib]
                continue

            if attrib is None:
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if cur is None:
                    # Attribute was dropped for an earlier link; there is
                    # nothing left to splice the replacement into.
                    continue
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)

643

class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Borrow the documentation of the method being wrapped.
        wrapped = getattr(source_class, name)
        self.__doc__ = wrapped.__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            # A string is always parsed into a fresh tree, so asking for a
            # copy makes no sense.
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        elif kw.pop('copy', self.copy):
            # Work on a private copy so the caller's tree is left untouched.
            doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # In-place method: hand back the document in the form we got it in.
        return _transform_result(result_type, doc)

# Function forms of the HtmlMixin methods of the same name.  Each accepts
# either an element or an HTML string as first argument (see _MethodFunc).
# With copy=True an element argument is deep-copied before the method runs,
# unless the caller overrides that with the ``copy`` keyword.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class combining etree.CommentBase with the HtmlMixin API."""
    pass

691

class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class combining etree.ElementBase with the HtmlMixin API."""
    # Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
    # etree.ElementBase comes first in the bases, so without these explicit
    # assignments its own versions would shadow the mixin's.
    cssselect = HtmlMixin.cssselect
    set = HtmlMixin.set

697

class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing instruction class combining etree.PIBase with the HtmlMixin API."""
    pass

701

class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class combining etree.EntityBase with the HtmlMixin API."""
    pass

705

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.  The '*' mixin is applied to every
    tag that has an entry in ``classes`` at construction time.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would yield only its keys and break
            # the (name, value) unpacking below, so go through items().
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            # Collect, per tag name, the list of mixins to apply.
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            # Build a new class per tag: the mixins first, then the class
            # that was registered for the tag (HtmlElement if none was).
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        # NOTE: a caller-supplied ``classes`` dict is stored (and, with
        # mixins, updated) as is, not copied.
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the default lookup.

        Elements are looked up by lower-cased tag name and fall back to
        HtmlElement; comments, processing instructions and entities map to
        their fixed Html* classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

################################################################################
# parsing
################################################################################

# Matchers telling whether markup starts (after optional whitespace) with an
# ``<html`` tag or a ``<!doctype`` declaration, case-insensitively.  There is
# one for unicode strings and one for byte strings.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match

def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a complete document and return its root element.

    Uses the module's default ``html_parser`` unless ``parser`` is given;
    extra keyword arguments are passed on to ``etree.fromstring()``.
    If ``ensure_head_body`` is true, a missing ``head`` child is inserted
    first and a missing ``body`` child is appended.

    Raises ``etree.ParserError`` if parsing yields no root element.
    """
    active_parser = html_parser if parser is None else parser
    root = etree.fromstring(html, active_parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root

771

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that does not look like a full document is wrapped in an
    # artificial <html><body> so the children of <body> are the fragments.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            html = ('<html><body>'.encode('ascii') + html +
                    '</body></html>'.encode('ascii'))
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    fragments = []
    leading_text = body.text
    if leading_text and leading_text.strip():
        # Non-blank text before the first element.
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % leading_text)
        fragments.append(leading_text)
    fragments.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return fragments

810

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a wrapper element will hold it.
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        # Any true non-string value means "use the default wrapper tag".
        wrapper_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(wrapper_tag)
        if elements:
            first = elements[0]
            if isinstance(first, basestring):
                new_root.text = first
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        found = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % found)
    el = elements[0]
    trailing = el.tail
    if trailing and trailing.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % trailing)
    el.tail = None
    return el

859

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    What is returned depends on the input:

    * input starting with ``<html`` or a doctype: the parsed document root;
    * a parse result that contains a ``head``: the document root;
    * a body holding exactly one element and no non-blank text around it:
      that element;
    * anything else: the body element itself, renamed to ``div`` if it
      contains a block-level tag and to ``span`` otherwise.
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    # Append the extra body's text after whatever the first
                    # body currently ends with.
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

927

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    active_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, active_parser, base_url=base_url, **kw)

941

def _contains_block_level_tag(el):
    """Return True if ``el`` or any element below it is a block-level tag.

    Tags are compared without XHTML namespace against ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(
        _nons(node.tag) in defs.block_tags
        for node in el.iter(etree.Element))

950

def _element_name(el):
    """Return a short display name for an item of a parsed fragment list.

    Comments give ``'comment'``, plain strings give ``'string'`` and
    anything else gives its tag name without XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

959

################################################################################
# form handling
################################################################################

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        An accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  Setting values in this dictionary changes the form.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        # Names that exist in the form but are missing from ``value``
        # are reset to None once the new values have been applied.
        leftover = current.keys()
        for name, new_value in value.items():
            if name in leftover:
                leftover.remove(name)
            current[name] = new_value
        for name in leftover:
            if name is not None:
                current[name] = None
            # else: an unnamed input; these aren't really expressed
            # in form_values() anyway.

    def _name(self):
        """
        Return a label for this form: its ``name`` attribute, else
        ``'#'`` plus its ``id``, else its index among the forms found
        below ``self.body`` (as a string).
        """
        form_name = self.get('name')
        if form_name:
            return form_name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        iter_tags = self.body.iter
        all_forms = list(iter_tags('form'))
        if not all_forms:
            all_forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(all_forms.index(self))

    def form_values(self):
        """
        Return a list of ``(name, value)`` tuples of the field values
        for the form.  This is suitable to be passed to
        ``urllib.urlencode()``.

        Inputs without a name or with a ``disabled`` attribute are
        skipped, as are unchecked checkable inputs and inputs of type
        submit, image, reset or file.
        """
        pairs = []
        for el in self.inputs:
            name = el.name
            if not name or 'disabled' in el.attrib:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                pairs.append((name, el.value))
                continue
            if tag == 'select':
                selected = el.value
                if el.multiple:
                    # one pair per selected option
                    pairs.extend((name, v) for v in selected)
                elif selected is not None:
                    pairs.append((name, el.value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset', 'file'):
                continue
            if el.value is not None:
                pairs.append((name, el.value))
        return pairs

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        When the element has a base URL and an action is set, the
        action is returned joined onto that base URL.
        """
        base = self.base_url
        target = self.get('action')
        if not base or target is None:
            return target
        return urljoin(base, target)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        declared = self.get('method', 'GET')
        return declared.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object, as
    from ``urllib.urlopen()``, which also has a ``.geturl()`` function
    that shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping (anything with ``.items()``) or a
    sequence of ``(name, value)`` tuples; it is appended to the values
    collected from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is the
    target URL as a string (the form's action, or its base URL when no action
    is set), and the values are a sequence of ``(name, value)`` tuples with
    the form data.
    """
    values = form.form_values()
    if extra_values:
        extras = extra_values
        if hasattr(extras, 'items'):
            extras = extras.items()
        values.extend(extras)
    opener = open_http_urllib if open_http is None else open_http
    # fall back to the document's base URL when the form has no action
    target = form.action or form.base_url
    return opener(form.method, target, values)

1120

def open_http_urllib(method, url, values):
    """
    Default opener for `submit_form`, based on urllib.

    For ``'GET'`` the URL-encoded ``values`` are appended to ``url`` as
    a query string and no request body is sent.  For any other method
    the encoded values are sent as the request body.

    Returns the result of ``urlopen()``.  Raises ``ValueError`` if
    ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError:  # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # Python 3's urlopen() only accepts a bytes request body;
            # urlencode() output is percent-encoded, hence pure ASCII.
            data = data.encode('ASCII')
    return urlopen(url, data)

1141

class FieldsDict(MutableMapping):
    """
    Mutable mapping view over an inputs accessor: reading or writing
    ``fields[name]`` reads or writes ``inputs[name].value``.  Keys
    cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        # deleting fields is not supported
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        names = self.inputs.keys()
        return iter(names)

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)

1167

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If several checkboxes share the
    same name, they are returned as a list (a `CheckboxGroup` which
    also allows value setting).  Radio inputs are handled similarly
    (as a `RadioGroup`).

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        # The first match decides whether the name denotes a group.
        input_type = matches[0].get('type')
        several = len(matches) > 1
        if input_type == 'radio' and several:
            group = RadioGroup(matches)
        elif input_type == 'checkbox' and several:
            group = CheckboxGroup(matches)
        else:
            # I don't like throwing away elements like this
            return matches[0]
        group.name = name
        return group

    def __contains__(self, name):
        matches = self._name_xpath(self.form, name=name)
        return bool(matches)

    def keys(self):
        """
        Return a list of the distinct names of all input elements,
        leaving out inputs that have no name.
        """
        names = set()
        for el in self:
            names.add(el.name)
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        elements = self._all_xpath(self.form)
        return iter(elements)

1234

class InputMixin(object):
    """
    Mix-in for all input elements (input, select, and textarea)
    """
    @property
    def name(self):
        """
        Get/set the name of the element (its ``name`` attribute).
        Deleting it removes the attribute if present.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        # Only mention the type when the element has a non-empty one.
        kind = getattr(self, 'type', None)
        type_part = ' type=%r' % kind if kind else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)

1265

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """
    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element).

        Child elements are serialised and appended to the leading text,
        using the 'xml' method for elements in the XHTML namespace and
        the 'html' method otherwise.
        """
        in_xhtml = self.tag.startswith("{%s}" % XHTML_NAMESPACE)
        serialisation_method = 'xml' if in_xhtml else 'html'
        content = self.text or ''
        for child in self:
            # it's rare that we actually get here, so let's not use ''.join()
            content += etree.tostring(
                child, method=serialisation_method, encoding='unicode')
        return content

    @value.setter
    def value(self, value):
        # drop all children, then store the new text
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """
    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.  Otherwise it is the
        value of the first option carrying a ``selected`` attribute,
        or None if there is none.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is None:
                continue
            chosen = option.get('value')
            if chosen is None:
                # no value attribute: fall back to the option text
                chosen = option.text or ''
            return chosen.strip() if chosen else chosen
        return None

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        checked_option = None
        if value is not None:
            value = value.strip()
            for option in _options_xpath(self):
                opt_value = option.get('value')
                if opt_value is None:
                    opt_value = option.text or ''
                if opt_value:
                    opt_value = opt_value.strip()
                if opt_value == value:
                    checked_option = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Unselect everything, then select the matching option (if any).
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if checked_option is not None:
            checked_option.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the option text
        where there is no ``value`` attribute).
        """
        found = []
        for option in _options_xpath(self):
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = option.text or ''
            found.append(opt_value.strip() if opt_value else opt_value)
        return found

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        found = _options_xpath(self.select)
        return iter(found)

    def __iter__(self):
        # Yield the value of every option that is currently selected.
        for option in self.options:
            if 'selected' not in option.attrib:
                continue
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = option.text or ''
            yield opt_value.strip() if opt_value else opt_value

    def add(self, item):
        """
        Select the first option whose value equals ``item``; raise
        ``ValueError`` if there is no such option.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = option.text or ''
            if opt_value:
                opt_value = opt_value.strip()
            if opt_value != item:
                continue
            option.set('selected', '')
            return
        raise ValueError(
            "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``; raise
        ``ValueError`` if it is not selected or does not exist.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = option.text or ''
            if opt_value:
                opt_value = opt_value.strip()
            if opt_value != item:
                continue
            if 'selected' not in option.attrib:
                raise ValueError(
                    "The option %r is not currently selected" % item)
            del option.attrib['selected']
            return
        raise ValueError(
            "There is not option with the value %r" % item)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__, shown, self.select.name)

1471

class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """
    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).  Reads as None when no radio in the
        group is checked; setting None unchecks all of them.
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            # Find the radio to check before touching any state.
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        options = []
        for radio in self:
            options.append(radio.get('value'))
        return options

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)

1524

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """
    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current selection.
        Assigning something without ``__iter__`` raises ``ValueError``
        and leaves the checkboxes untouched.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before clearing, so that a rejected assignment does
        # not wipe the current selection as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1568

class CheckboxValues(SetMixin):
    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Build the full list first, then hand out an iterator over it.
        checked_values = [
            box.get('value')
            for box in self.group
            if 'checked' in box.attrib]
        return iter(checked_values)

    def add(self, value):
        """
        Check the first checkbox with the given value; raise
        ``KeyError`` if the group has no such checkbox.
        """
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox with the given value; raise
        ``KeyError`` if it is already unchecked or does not exist.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown, self.group.name)

1611

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checked checkbox or radio and it has no
        value, this defaults to ``'on'``.  If it is a checkbox or
        radio that is not checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            # only string values are stored in the attribute
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute),
        lower-cased and defaulting to ``'text'``.
        """
        declared = self.get('type', 'text')
        return declared.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; on any other
        type an ``AttributeError`` is raised.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """
    @property
    def for_element(self):
        """
        Get/set the element this label points to.  Return None if
        the label has no ``for`` attribute.

        Setting requires an element with an ``id`` attribute and
        raises ``TypeError`` otherwise.  Deleting removes the ``for``
        attribute of this label.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link is stored in 'for' (see getter/setter); the label's
        # own 'id' attribute must not be touched here.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    ``html`` may be a tree (anything with ``getroot()``) or an element.
    Tags that already start with ``{`` are left alone.  The tree is
    modified in place.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # already an element
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in html.iter(etree.Element):
        name = node.tag
        if name[0] == '{':
            continue
        node.tag = xhtml_prefix + name

1763

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    ``xhtml`` may be a tree (anything with ``getroot()``) or an
    element.  The tree is modified in place.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # already an element
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(xhtml_prefix)
    for node in xhtml.iter(xhtml_prefix + "*"):
        # strip the leading "{namespace}" part of the tag
        node.tag = node.tag[cut:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# (Both names are the bound ``sub`` methods of compiled patterns, one
# for text and one for bytes; tostring() uses them to strip the tag.)
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # Strip the Content-Type <meta> tag, using the pattern variant
    # that matches the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)

# Replace the docstring of tostring() with the result of passing it
# through __fix_docstring (defined outside this part of the file).
tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or a tree.  The file is written with
    ``encoding`` if given, else the document's own encoding, else
    UTF-8.  The file URL is printed before the browser is launched.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    out_encoding = encoding or doc.docinfo.encoding or "UTF-8"
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as out:
        doc.write(out, method="html", encoding=out_encoding)
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)

1885

################################################################################
# configure Element class lookup
################################################################################

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1898

class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1920

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.

    All arguments are passed on to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instances.  ``html_parser`` is what parse() falls
# back to when no parser is given, and what Element() uses to create
# new elements.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Source Code for Package lxml.html