lxml.html

1 # Copyright (c) 2004 Ian Bicking. All rights reserved. 2 # 3 # Redistribution and use in source and binary forms, with or without 4 # modification, are permitted provided that the following conditions are 5 # met: 6 # 7 # 1. Redistributions of source code must retain the above copyright 8 # notice, this list of conditions and the following disclaimer. 9 # 10 # 2. Redistributions in binary form must reproduce the above copyright 11 # notice, this list of conditions and the following disclaimer in 12 # the documentation and/or other materials provided with the 13 # distribution. 14 # 15 # 3. Neither the name of Ian Bicking nor the names of its contributors may 16 # be used to endorse or promote products derived from this software 17 # without specific prior written permission. 18 # 19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 23 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 """The ``lxml.html`` tool set for HTML handling. 
32 """ 33 34 from __future__ import absolute_import 35 36 __all__ = [ 37 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 38 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 39 'find_rel_links', 'find_class', 'make_links_absolute', 40 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] 41 42 43 import copy 44 import sys 45 import re 46 from functools import partial 47 48 try: 49 from collections.abc import MutableMapping, MutableSet 50 except ImportError: 51 from collections import MutableMapping, MutableSet 52 53 from .. import etree 54 from . import defs 55 from ._setmixin import SetMixin 56 57 try: 58 from urlparse import urljoin 59 except ImportError: 60 # Python 3 61 from urllib.parse import urljoin 62 63 try: 64 unicode 65 except NameError: 66 # Python 3 67 unicode = str 68 try: 69 basestring 70 except NameError: 71 # Python 3 72 basestring = (str, bytes)

def __fix_docstring(s):
    """Strip the interpreter-specific string prefix from doctest output lines.

    On Python 3 a ``u'`` that starts a line (after optional whitespace) is
    rewritten to ``'``; on Python 2 the same is done for ``b'``.  Falsy input
    (``None`` or an empty string) is returned unchanged.
    """
    if s:
        prefix = "u" if sys.version_info[0] >= 3 else "b"
        pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
        return pattern.sub(r"\1'", s)
    return s

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Where an element name is involved, each
# expression matches both the plain HTML element and its XHTML-namespaced
# counterpart (bound to the prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Finds CSS ``url(...)`` references.  The outer parentheses are literal and
# must be escaped; group 1 captures the URL including any surrounding quotes
# (double-quoted, single-quoted or bare).
_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
# Finds double-quoted ``@import "..."`` targets; group 1 is the unquoted URL.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Splits a space-separated ``archive`` attribute value into its entries.
_archive_re = re.compile(r'[^ ]+')
# Extracts the target URL (named group ``url``) from the ``content`` value of
# a ``<meta http-equiv="refresh">`` tag; the ``url=`` keyword is optional.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``, if present.

    Returns a ``(text, position)`` tuple.  When ``s`` starts and ends with
    the same quote character (``"`` or ``'``), the first and last characters
    are dropped and the position is advanced by one so that it points at the
    unquoted text.  Otherwise both values are returned unchanged.
    """
    first, last = s[:1], s[-1:]
    for quote in ('"', "'"):
        if first == quote and last == quote:
            return s[1:-1], pos + 1
    return s, pos

111

def _transform_result(typ, result):
    """Serialise ``result`` so that it matches the caller's input type.

    ``typ`` is the type of the original input.  For a ``bytes`` subclass the
    result is serialised with ``encoding='utf-8'``; for a ``unicode`` (text)
    subclass it is serialised with ``encoding='unicode'``.  For any other
    type ``result`` is handed back untouched.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)

122

def _nons(tag):
    """Return ``tag`` without the XHTML namespace prefix, if it has one.

    String tags of the form ``{http://www.w3.org/1999/xhtml}name`` are
    reduced to ``name``.  Any other value, including non-string tags, is
    returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    in_xhtml = tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE
    return tag.split('}')[-1] if in_xhtml else tag

129

class Classes(MutableSet):
    """Set-like view of the names stored in an element's 'class' attribute.

    Every change is written straight back to the wrapped attribute mapping.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """

    def __init__(self, attributes):
        # 'attributes' is the mapping holding the 'class' entry
        # (e.g. ``el.attrib``).
        self._attributes = attributes
        # Bound reader that returns '' when no class attribute exists.
        self._get_class_value = partial(attributes.get, 'class', '')

    def add(self, value):
        """
        Add a class name.

        Nothing happens if the name is already present.  Empty names and
        names containing whitespace raise a ValueError.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class name if it is present; otherwise do nothing.

        The 'class' attribute is deleted once no names are left.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        current = self._get_class_value().split()
        remaining = [name for name in current if name != value]
        if remaining:
            self._attributes['class'] = ' '.join(remaining)
        elif 'class' in self._attributes:
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class name that must currently be present.

        Raises a KeyError if the name is not present.
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        value = self._get_class_value()
        # Cheap substring pre-check before splitting into whole names.
        if name not in value:
            return False
        return name in value.split()

    def __iter__(self):
        names = self._get_class_value().split()
        return iter(names)

    def __len__(self):
        names = self._get_class_value().split()
        return len(names)

    # non-standard methods

    def update(self, values):
        """
        Add every name from 'values' that is not present yet.
        """
        names = self._get_class_value().split()
        count_before = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        # Only touch the attribute when something was actually added.
        if len(names) != count_before:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Add a class name if it is missing, or remove it if it is present.

        Returns true if the name was added (and is now enabled) and
        false if it was removed (and is now disabled).
        """
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled

237

238 239 -class HtmlMixin(object):

240

241 - def set(self, key, value=None):

242 """set(self, key, value=None) 243 244 Sets an element attribute. If no value is provided, or if the value is None, 245 creates a 'boolean' attribute without value, e.g. "<form novalidate></form>" 246 for ``form.set('novalidate')``. 247 """ 248 super(HtmlElement, self).set(key, value)

249 250 @property

251 - def classes(self):

252 """ 253 A set-like wrapper around the 'class' attribute. 254 """ 255 return Classes(self.attrib)

256 257 @classes.setter

258 - def classes(self, classes):

259 assert isinstance(classes, Classes) # only allow "el.classes |= ..." etc. 260 value = classes._get_class_value() 261 if value: 262 self.set('class', value) 263 elif self.get('class') is not None: 264 del self.attrib['class']

265 266 @property

267 - def base_url(self):

268 """ 269 Returns the base URL, given when the page was parsed. 270 271 Use with ``urlparse.urljoin(el.base_url, href)`` to get 272 absolute URLs. 273 """ 274 return self.getroottree().docinfo.URL

275 276 @property

277 - def forms(self):

278 """ 279 Return a list of all the forms 280 """ 281 return _forms_xpath(self)

282 283 @property

284 - def body(self):

285 """ 286 Return the <body> element. Can be called from a child element 287 to get the document's head. 288 """ 289 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

290 291 @property

292 - def head(self):

293 """ 294 Returns the <head> element. Can be called from a child 295 element to get the document's head. 296 """ 297 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

298 299 @property

300 - def label(self):

301 """ 302 Get or set any <label> element associated with this element. 303 """ 304 id = self.get('id') 305 if not id: 306 return None 307 result = _label_xpath(self, id=id) 308 if not result: 309 return None 310 else: 311 return result[0]

312 313 @label.setter

314 - def label(self, label):

315 id = self.get('id') 316 if not id: 317 raise TypeError( 318 "You cannot set a label for an element (%r) that has no id" 319 % self) 320 if _nons(label.tag) != 'label': 321 raise TypeError( 322 "You can only assign label to a label element (not %r)" 323 % label) 324 label.set('for', id)

325 326 @label.deleter

327 - def label(self):

328 label = self.label 329 if label is not None: 330 del label.attrib['for']

331

332 - def drop_tree(self):

333 """ 334 Removes this element from the tree, including its children and 335 text. The tail text is joined to the previous element or 336 parent. 337 """ 338 parent = self.getparent() 339 assert parent is not None 340 if self.tail: 341 previous = self.getprevious() 342 if previous is None: 343 parent.text = (parent.text or '') + self.tail 344 else: 345 previous.tail = (previous.tail or '') + self.tail 346 parent.remove(self)

347

348 - def drop_tag(self):

349 """ 350 Remove the tag, but not its children or text. The children and text 351 are merged into the parent. 352 353 Example:: 354 355 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 356 >>> h.find('.//b').drop_tag() 357 >>> print(tostring(h, encoding='unicode')) 358 <div>Hello World!</div> 359 """ 360 parent = self.getparent() 361 assert parent is not None 362 previous = self.getprevious() 363 if self.text and isinstance(self.tag, basestring): 364 # not a Comment, etc. 365 if previous is None: 366 parent.text = (parent.text or '') + self.text 367 else: 368 previous.tail = (previous.tail or '') + self.text 369 if self.tail: 370 if len(self): 371 last = self[-1] 372 last.tail = (last.tail or '') + self.tail 373 elif previous is None: 374 parent.text = (parent.text or '') + self.tail 375 else: 376 previous.tail = (previous.tail or '') + self.tail 377 index = parent.index(self) 378 parent[index:index+1] = self[:]

379

380 - def find_rel_links(self, rel):

381 """ 382 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements. 383 """ 384 rel = rel.lower() 385 return [el for el in _rel_links_xpath(self) 386 if el.get('rel').lower() == rel]

387

388 - def find_class(self, class_name):

389 """ 390 Find any elements with the given class name. 391 """ 392 return _class_xpath(self, class_name=class_name)

393

394 - def get_element_by_id(self, id, *default):

395 """ 396 Get the first element in a document with the given id. If none is 397 found, return the default argument if provided or raise KeyError 398 otherwise. 399 400 Note that there can be more than one element with the same id, 401 and this isn't uncommon in HTML documents found in the wild. 402 Browsers return only the first match, and this function does 403 the same. 404 """ 405 try: 406 # FIXME: should this check for multiple matches? 407 # browsers just return the first one 408 return _id_xpath(self, id=id)[0] 409 except IndexError: 410 if default: 411 return default[0] 412 else: 413 raise KeyError(id)

414

415 - def text_content(self):

416 """ 417 Return the text content of the tag (and the text in any children). 418 """ 419 return _collect_string_content(self)

420

421 - def cssselect(self, expr, translator='html'):

422 """ 423 Run the CSS expression on this element and its children, 424 returning a list of the results. 425 426 Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) 427 -- note that pre-compiling the expression can provide a substantial 428 speedup. 429 """ 430 # Do the import here to make the dependency optional. 431 from lxml.cssselect import CSSSelector 432 return CSSSelector(expr, translator=translator)(self)

433 434 ######################################## 435 ## Link functions 436 ######################################## 437

438 - def make_links_absolute(self, base_url=None, resolve_base_href=True, 439 handle_failures=None):

440 """ 441 Make all links in the document absolute, given the 442 ``base_url`` for the document (the full URL where the document 443 came from), or if no ``base_url`` is given, then the ``.base_url`` 444 of the document. 445 446 If ``resolve_base_href`` is true, then any ``<base href>`` 447 tags in the document are used *and* removed from the document. 448 If it is false then any such tag is ignored. 449 450 If ``handle_failures`` is None (default), a failure to process 451 a URL will abort the processing. If set to 'ignore', errors 452 are ignored. If set to 'discard', failing URLs will be removed. 453 """ 454 if base_url is None: 455 base_url = self.base_url 456 if base_url is None: 457 raise TypeError( 458 "No base_url given, and the document has no base_url") 459 if resolve_base_href: 460 self.resolve_base_href() 461 462 if handle_failures == 'ignore': 463 def link_repl(href): 464 try: 465 return urljoin(base_url, href) 466 except ValueError: 467 return href

468 elif handle_failures == 'discard': 469 def link_repl(href): 470 try: 471 return urljoin(base_url, href) 472 except ValueError: 473 return None

474 elif handle_failures is None: 475 def link_repl(href): 476 return urljoin(base_url, href) 477 else: 478 raise ValueError( 479 "unexpected value for handle_failures: %r" % handle_failures) 480 481 self.rewrite_links(link_repl) 482

483 - def resolve_base_href(self, handle_failures=None):

484 """ 485 Find any ``<base href>`` tag in the document, and apply its 486 values to all links found in the document. Also remove the 487 tag once it has been applied. 488 489 If ``handle_failures`` is None (default), a failure to process 490 a URL will abort the processing. If set to 'ignore', errors 491 are ignored. If set to 'discard', failing URLs will be removed. 492 """ 493 base_href = None 494 basetags = self.xpath('//base[@href]|//x:base[@href]', 495 namespaces={'x': XHTML_NAMESPACE}) 496 for b in basetags: 497 base_href = b.get('href') 498 b.drop_tree() 499 if not base_href: 500 return 501 self.make_links_absolute(base_href, resolve_base_href=False, 502 handle_failures=handle_failures)

503

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.  The one
        exception are ``<object>`` tags: their ``classid``, ``data`` and
        ``archive`` values are joined with the tag's ``codebase`` when
        that attribute is present.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.  (Entries of an ``archive`` attribute
        are yielded in document order, not reversed.)
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    # 'archive' holds a space-separated list of URLs; each
                    # entry is reported with its offset in the attribute.
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                # Generic case: every known link attribute that is present.
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                # Links in the stylesheet text: url(...) references plus
                # double-quoted @import targets.
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                    ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            # Inline style attributes are checked for every element,
            # independent of the tag-specific handling above.
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

593

594 - def rewrite_links(self, link_repl_func, resolve_base_href=True, 595 base_href=None):

596 """ 597 Rewrite all the links in the document. For each link 598 ``link_repl_func(link)`` will be called, and the return value 599 will replace the old link. 600 601 Note that links may not be absolute (unless you first called 602 ``make_links_absolute()``), and may be internal (e.g., 603 ``'#anchor'``). They can also be values like 604 ``'mailto:email'`` or ``'javascript:expr'``. 605 606 If you give ``base_href`` then all links passed to 607 ``link_repl_func()`` will take that into account. 608 609 If the ``link_repl_func`` returns None, the attribute or 610 tag text will be removed completely. 611 """ 612 if base_href is not None: 613 # FIXME: this can be done in one pass with a wrapper 614 # around link_repl_func 615 self.make_links_absolute( 616 base_href, resolve_base_href=resolve_base_href) 617 elif resolve_base_href: 618 self.resolve_base_href() 619 620 for el, attrib, link, pos in self.iterlinks(): 621 new_link = link_repl_func(link.strip()) 622 if new_link == link: 623 continue 624 if new_link is None: 625 # Remove the attribute or element content 626 if attrib is None: 627 el.text = '' 628 else: 629 del el.attrib[attrib] 630 continue 631 632 if attrib is None: 633 new = el.text[:pos] + new_link + el.text[pos+len(link):] 634 el.text = new 635 else: 636 cur = el.get(attrib) 637 if not pos and len(cur) == len(link): 638 new = new_link # most common case 639 else: 640 new = cur[:pos] + new_link + cur[pos+len(link):] 641 el.set(attrib, new)

642

class _MethodFunc(object):
    """
    Function-style wrapper around a method of an element class.

    The resulting callable takes either an element or an HTML string as
    its first argument.  It returns whatever the method returns; if the
    method works in-place (and so returns None), the processed document
    is returned instead, serialised when the input was a string.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        # Default for the per-call 'copy' keyword (element inputs only).
        self.copy = copy
        # Expose the wrapped method's documentation on the wrapper.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            # NOTE: the keyword arguments go to the parser here *and* to the
            # method call below.
            doc = fromstring(doc, **kw)
        elif kw.pop('copy', self.copy):
            doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)

# Module-level function versions of the HtmlMixin methods of the same name.
# Each accepts an element or an HTML string as its first argument.  The
# wrappers created with copy=True deep-copy an element argument before
# calling the method, unless the caller passes ``copy=False``.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also provides the ``HtmlMixin`` methods."""
    pass

690

class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that combines ``etree.ElementBase`` with ``HtmlMixin``."""
    # Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
    # Because ElementBase comes first in the bases, its versions would win
    # the attribute lookup unless the mixin's are re-bound here explicitly.
    cssselect = HtmlMixin.cssselect
    set = HtmlMixin.set

696

class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing instruction class that also provides the ``HtmlMixin`` methods."""
    pass

700

class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also provides the ``HtmlMixin`` methods."""
    pass

704

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs; the latter allows several mixins
    for the same tag name.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would yield only its keys, so
            # mappings are walked through items(); anything else is taken
            # to be an iterable of (name, mixin) pairs already.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    # applies to every tag that has a class registered
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that their attributes take precedence
                # over those of the element class.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None to fall back to the
        normal lookup.  Element classes are looked up by lower-cased tag
        name, with ``HtmlElement`` as the default.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

################################################################################
# parsing
################################################################################

# Matchers that tell whether the input starts (after optional whitespace)
# with an ``<html`` tag or a ``<!doctype`` declaration, case-insensitively.
# There is one variant for text input and one for byte string input.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match

def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """Parse ``html`` as a complete document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  If
    ``ensure_head_body`` is true, a missing ``head`` element is inserted as
    first child and a missing ``body`` element is appended.

    Raises ``etree.ParserError`` if parsing yields no document.
    """
    root = etree.fromstring(html, html_parser if parser is None else parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if root.find('head') is None:
            root.insert(0, Element('head'))
        if root.find('body') is None:
            root.append(Element('body'))
    return root

770

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.
    If no_leading_text is true, then it will be an error if there is
    leading text, and it will always be a list of only elements.

    base_url will set the document's base_url attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Input that does not look like a full document is wrapped into one.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # can't use %-formatting in early Py3 versions
            opening = '<html><body>'.encode('ascii')
            closing = '</body></html>'.encode('ascii')
            html = opening + html + closing
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    leading = body.text
    if leading and leading.strip():
        # Non-whitespace text before the first element.
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % leading)
        elements.append(leading)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements

809

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a parent element wraps the result.
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        # Any true non-string value selects the default 'div' wrapper.
        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(parent_tag)
        if elements:
            first = elements[0]
            if isinstance(first, basestring):
                new_root.text = first
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)" % names)
    el = elements[0]
    trailing = el.tail
    if trailing and trailing.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % trailing)
    el.tail = None
    return el

858

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    looks_full = (_looks_like_full_html_bytes if isinstance(html, bytes)
                  else _looks_like_full_html_unicode)
    is_full_html = looks_full(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = bodies[0] if bodies else None
    # Somehow there may be multiple bodies, which is bad, but just
    # smash them into one body
    for other_body in bodies[1:]:
        extra_text = other_body.text
        if extra_text:
            if len(body):
                body[-1].tail = (body[-1].tail or '') + extra_text
            else:
                body.text = (body.text or '') + extra_text
        body.extend(other_body)
        # We'll ignore tail
        # I guess we are ignoring attributes too
        other_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc
    if body is None:
        return doc

    def is_blank(text):
        return not text or not text.strip()

    if len(body) == 1 and is_blank(body.text) and is_blank(body[-1].tail):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body

926

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)

940

def _contains_block_level_tag(el):
    """Tell whether ``el`` or any element below it is a block-level tag,
    i.e. has a tag name listed in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(_nons(node.tag) in defs.block_tags
               for node in el.iter(etree.Element))

949

def _element_name(el):
    """Return a short name for a parsed item, for use in error messages:
    'comment' for comments, 'string' for text, else the tag name without
    the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

958

################################################################################
# form handling
################################################################################

class FormElement(HtmlElement):
    """
    Element class used for ``<form>`` tags.
    """

    @property
    def inputs(self):
        """
        An `InputGetter` wrapping this form, giving access to all of
        its input elements.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        A `FieldsDict` built on ``self.inputs``.  Assigning to keys of
        the returned object sets the value of the matching input.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        # Assign every supplied field, then reset to None each field that
        # existed before but was not mentioned in the new mapping.
        current = self.fields
        stale_keys = current.keys()
        for key, new_value in value.items():
            if key in stale_keys:
                stale_keys.remove(key)
            current[key] = new_value
        for key in stale_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            current[key] = None

    def _name(self):
        """
        Return a label for this form: its ``name`` attribute, else
        ``'#'`` plus its ``id`` attribute, else its index (as a string)
        among the form elements found under ``self.body``.
        """
        form_name = self.get('name')
        if form_name:
            return form_name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        iter_tags = self.body.iter
        candidates = list(iter_tags('form'))
        if not candidates:
            # Fall back to forms in the XHTML namespace.
            candidates = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(candidates.index(self))

    def form_values(self):
        """
        Return the form's field values as a list of ``(name, value)``
        tuples, suitable for ``urllib.urlencode()``.

        Inputs without a name or carrying a ``disabled`` attribute are
        skipped, as are unchecked checkable inputs and inputs of type
        submit, image, reset or file.  A multi-select contributes one
        tuple per selected value.
        """
        pairs = []
        for field in self.inputs:
            name = field.name
            if not name or 'disabled' in field.attrib:
                continue
            tag = _nons(field.tag)
            if tag == 'textarea':
                pairs.append((name, field.value))
                continue
            if tag == 'select':
                selected = field.value
                if field.multiple:
                    pairs.extend((name, item) for item in selected)
                elif selected is not None:
                    pairs.append((name, field.value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % field)
            if field.checkable and not field.checked:
                continue
            if field.type in ('submit', 'image', 'reset', 'file'):
                continue
            if field.value is not None:
                pairs.append((name, field.value))
        return pairs

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.  On read, the value is
        joined with ``self.base_url`` when both are available.
        """
        base_url = self.base_url
        target = self.get('action')
        if target is None or not base_url:
            return target
        return urljoin(base_url, target)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Reads always return an upper-case
        string and default to ``'GET'``; writes are stored upper-cased.
        """
        raw_method = self.get('method', 'GET')
        return raw_method.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())

# Register FormElement under the 'form' key of the lookup's default
# element-class table.
HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object, as from
    ``urllib.urlopen()``, which also has a ``.geturl()`` function showing
    the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = form.submit()
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping (its ``items()`` are used) or a
    sequence of ``(name, value)`` tuples; it is appended to the values
    collected from ``form.form_values()``.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the
    following signature::

        open_http(method, URL, values)

    It is called with ``form.method``, the target URL (``form.action``,
    or ``form.base_url`` when the action is empty) and the sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extras = extra_values.items()
        else:
            extras = extra_values
        values.extend(extras)
    opener = open_http_urllib if open_http is None else open_http
    url = form.action or form.base_url
    return opener(form.method, url, values)

1119

def open_http_urllib(method, url, values):
    """
    Default opener for `submit_form`, built on ``urlopen``.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are passed as the request
    body, encoded to ASCII bytes if necessary.  Raises ``ValueError``
    when ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError:  # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    encoded = urlencode(values)
    if method == 'GET':
        # Extend an existing query string, or start a new one.
        separator = '&' if '?' in url else '?'
        return urlopen(url + separator + encoded, None)
    payload = encoded if isinstance(encoded, bytes) else encoded.encode('ASCII')
    return urlopen(url, payload)

1142

class FieldsDict(MutableMapping):
    """
    Mapping view over an inputs accessor: reading or writing a key
    reads or writes the ``.value`` of ``inputs[key]``.  Keys cannot be
    deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __repr__(self):
        form_label = self.inputs.form._name()
        return '<%s for form %s>' % (self.__class__.__name__, form_label)

    def __len__(self):
        return len(self.inputs)

    def __iter__(self):
        return iter(self.inputs.keys())

    def __contains__(self, item):
        return item in self.inputs

    def keys(self):
        # Delegates to the accessor, so the result type is whatever
        # ``inputs.keys()`` returns.
        return self.inputs.keys()

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        # Removal is not supported; always raises.
        raise KeyError(
            "You cannot remove keys from ElementDict")

1168

class InputGetter(object):

    """
    Accessor for all the input fields (``input``, ``select`` and
    ``textarea`` elements) of a form.

    Index it by field name, e.g. ``form.inputs['field_name']``.  When
    several checkboxes share the name, a `CheckboxGroup` (a list that
    also supports value setting) is returned; several radio inputs with
    the same name come back as a `RadioGroup`.

    Iterating yields every input element individually, so checkboxes
    and radios are not grouped the way they are for name lookups.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        form_label = self.form._name()
        return '<%s for form %s>' % (self.__class__.__name__, form_label)

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        # The type of the first match decides whether a group is built.
        input_type = matches[0].get('type')
        group_class = None
        if len(matches) > 1:
            if input_type == 'radio':
                group_class = RadioGroup
            elif input_type == 'checkbox':
                group_class = CheckboxGroup
        if group_class is None:
            # I don't like throwing away elements like this
            return matches[0]
        group = group_class(matches)
        group.name = name
        return group

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        """
        Return a list of the distinct names of all inputs, leaving out
        inputs whose name is None.
        """
        names = {el.name for el in self}
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)

1235

class InputMixin(object):
    """
    Shared behaviour for the input element classes (input, select, and
    textarea): a ``name`` property and a descriptive ``repr``.
    """

    @property
    def name(self):
        """
        Get/set/delete the ``name`` attribute of the element.
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        # Only mention the type when the element exposes a truthy one.
        input_type = getattr(self, 'type', None)
        type_suffix = (' type=%r' % input_type) if input_type else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_suffix)

1266

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    content can be read and written through ``.value``.
    """

    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element).

        The result is the element text followed by the serialisation of
        any child elements; children are serialised as XML when the tag
        is in the XHTML namespace and as HTML otherwise.
        """
        in_xhtml = self.tag.startswith("{%s}" % XHTML_NAMESPACE)
        serialisation_method = 'xml' if in_xhtml else 'html'
        pieces = [self.text or '']
        # Child elements are rare inside a textarea.
        pieces.extend(
            etree.tostring(child, method=serialisation_method,
                           encoding='unicode')
            for child in self)
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        # Drop all children, then store the new content as text.
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]

# Register TextareaElement under the 'textarea' key of the lookup's
# default element-class table.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  The name is available as ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is a
    set-like object.  In either case ``.value_options`` gives the
    possible values.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """

    @staticmethod
    def _option_value(option):
        # An option's value is its ``value`` attribute, falling back to
        # its stripped text when the attribute is absent.
        explicit = option.get('value')
        if explicit is None:
            return (option.text or '').strip()
        return explicit

    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        For a multi-select this is a set-like `MultipleSelectOptions`.
        Otherwise the last option carrying ``selected`` wins; if none
        does, the first option without ``disabled`` is used; if there is
        no such option either, the result is None.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        options = _options_xpath(self)

        for candidate in reversed(options):
            if candidate.get('selected') is not None:
                chosen = candidate
                break
        else:
            for candidate in options:
                if candidate.get('disabled') is None:
                    chosen = candidate
                    break
            else:
                return None
        return self._option_value(chosen)

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        target = None
        if value is not None:
            # Locate the option first so nothing changes on failure.
            for option in _options_xpath(self):
                if self._option_value(option) == value:
                    target = option
                    break
            if target is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    @property
    def value_options(self):
        """
        All the possible values this select can have: for each
        ``<option>``, its ``value`` attribute or, when that is missing,
        its stripped text.
        """
        return [self._option_value(option)
                for option in _options_xpath(self)]

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']

# Register SelectElement under the 'select' key of the lookup's default
# element-class table.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Set-like view of the selected options of a ``<select multiple>``
    element.

    Adding a value selects the matching option; removing a value
    unselects it.
    """

    def __init__(self, select):
        self.select = select

    @staticmethod
    def _value_of(option):
        # ``value`` attribute, or the stripped text when it is absent.
        explicit = option.get('value')
        if explicit is None:
            return (option.text or '').strip()
        return explicit

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        for option in self.options:
            if 'selected' not in option.attrib:
                continue
            yield self._value_of(option)

    def add(self, item):
        """
        Select the first option whose value equals ``item``; raise
        ``ValueError`` if there is none.
        """
        target = next(
            (opt for opt in self.options if self._value_of(opt) == item),
            None)
        if target is None:
            raise ValueError(
                "There is no option with the value %r" % item)
        target.set('selected', '')

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``; raise
        ``ValueError`` if there is no such option or it is not selected.
        """
        target = next(
            (opt for opt in self.options if self._value_of(opt) == item),
            None)
        if target is None:
            raise ValueError(
                "There is not option with the value %r" % item)
        if 'selected' not in target.attrib:
            raise ValueError(
                "The option %r is not currently selected" % item)
        del target.attrib['selected']

    def __repr__(self):
        selected = ', '.join([repr(v) for v in self])
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__, selected, self.select.name)

1465

class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the list behaviour, the ``.value`` property checks and
    unchecks inputs, and ``.value_options`` lists the possible values.
    """

    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).  Reads return the ``value`` attribute
        of the first checked radio, or None when none is checked.
        """
        checked = [el for el in self if 'checked' in el.attrib]
        if not checked:
            return None
        return checked[0].get('value')

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            # Find the radio first so nothing is unchecked on failure.
            target = next(
                (el for el in self if el.get('value') == value), None)
            if target is None:
                raise ValueError("There is no radio input with the value %r" % value)
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of the ``value`` attribute of every radio.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, list.__repr__(self))

1518

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable replaces the current selection with its
        items.  Assigning something without ``__iter__`` raises
        ``ValueError`` and leaves the checkboxes untouched.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before mutating: an invalid assignment must not wipe
        # the current selection as a side effect of failing.
        if not hasattr(value, '__iter__'):
            # An empty group has no element to take the name from.
            group_name = self[0].name if self else None
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (group_name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of the ``value`` attribute of every checkbox.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1562

class CheckboxValues(SetMixin):
    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes that share one name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Snapshot the checked values before handing out an iterator.
        checked_values = []
        for el in self.group:
            if 'checked' in el.attrib:
                checked_values.append(el.get('value'))
        return iter(checked_values)

    def add(self, value):
        """
        Check the first checkbox whose ``value`` attribute equals
        ``value``; raise ``KeyError`` if there is none.
        """
        target = next(
            (el for el in self.group if el.get('value') == value), None)
        if target is None:
            raise KeyError("No checkbox with value %r" % value)
        target.set('checked', '')

    def remove(self, value):
        """
        Uncheck the first checkbox whose ``value`` attribute equals
        ``value``; raise ``KeyError`` if there is none or it is already
        unchecked.
        """
        target = next(
            (el for el in self.group if el.get('value') == value), None)
        if target is None:
            raise KeyError(
                "No checkbox with value %r" % value)
        if 'checked' not in target.attrib:
            raise KeyError(
                "The checkbox with value %r was already unchecked" % value)
        del target.attrib['checked']

    def __repr__(self):
        checked = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, checked, self.group.name)

1605

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type (lower-cased, defaulting to
    ``'text'``), and ``.value`` reads and writes the value.

    Checkboxes and radios have ``input.checkable == True`` (false for
    every other type) and a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checkable input that is checked but has no (or an empty)
        value reports ``'on'``; a checkable input that is not checked
        reports None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs a truthy value checks the box; the value
        # attribute is only rewritten when a string was given.
        self.checked = bool(value)
        if value and isinstance(value, basestring):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the lower-cased ``type`` attribute, defaulting to
        ``'text'``.
        """
        declared = self.get('type', 'text')
        return declared.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked (is it a checkbox or radio)?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        Only usable on checkable input types; other types raise
        ``AttributeError``.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']

# Register InputElement under the 'input' key of the lookup's default
# element-class table.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    @property
    def for_element(self):
        """
        Get/set/delete the element this label points to.

        Reading returns None when the label has no (or an empty)
        ``for`` attribute; otherwise the lookup is delegated to
        ``self.body.get_element_by_id()``.  Setting stores the ``id`` of
        the given element in ``for`` and raises ``TypeError`` if that
        element has no id.  Deleting removes the ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): behaviour for an id that matches no element is
        # whatever get_element_by_id() does -- confirm it does not raise
        # if callers expect None here.
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        other_id = other.get('id')
        if not other_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', other_id)

    @for_element.deleter
    def for_element(self):
        # The link lives in the label's ``for`` attribute, so that is
        # what must be removed (not the label's own ``id``).
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']

# Register LabelElement under the 'label' key of the lookup's default
# element-class table.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    ``html`` may be an element or anything with a ``getroot()`` method.
    Tags that already start with ``{`` (i.e. carry a namespace) are left
    alone.  The tree is modified in place.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # Already an element rather than a tree.
        root = html
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        current_tag = node.tag
        if current_tag[0] != '{':
            node.tag = namespace_prefix + current_tag

1757

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    ``xhtml`` may be an element or anything with a ``getroot()`` method.
    Only elements in the XHTML namespace are touched.  The tree is
    modified in place.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # Already an element rather than a tree.
        root = xhtml
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    strip_from = len(namespace_prefix)
    for node in root.iter(namespace_prefix + "*"):
        node.tag = node.tag[strip_from:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# Each name is bound to the ``.sub`` method of a compiled pattern (one
# for text, one for bytes), so it is called as
# ``replace(replacement, string)``.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true the serialised output is
    returned untouched, including any
    ``<meta http-equiv="Content-Type" ...>`` tag it contains; if it is
    false and ``method`` is ``'html'``, every such tag is removed from
    the output.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail, doctype=doctype)
    if include_meta_content_type or method != 'html':
        return serialised
    # Strip the content-type meta tag using the helper that matches the
    # type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)

# Replace the docstring with the result of passing it through the
# module's ``__fix_docstring`` helper (defined elsewhere in this file).
tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards.  This is
    mainly meant for debugging.

    ``encoding`` selects the output encoding; when it is not given, the
    document's own encoding is used, falling back to UTF-8.  The
    ``file://`` URL of the temporary file is also printed.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, path = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as out:
        output_encoding = encoding or doc.docinfo.encoding or "UTF-8"
        doc.write(out, method="html", encoding=output_encoding)
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)

1879

################################################################################
# configure Element class lookup
################################################################################

class HTMLParser(etree.HTMLParser):
    """An ``etree.HTMLParser`` subclass configured to return lxml.html
    Element objects.

    All keyword arguments are passed on to the base parser.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1892

class XHTMLParser(etree.XMLParser):
    """An ``etree.XMLParser`` subclass configured to return lxml.html
    Element objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1914

def Element(*args, **kw):
    """Create a new HTML Element.

    All arguments are forwarded to ``html_parser.makeelement()``.
    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instances, created with default options;
# ``html_parser`` is the fallback parser used by functions in this
# module when no parser is passed in.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Source Code for Package lxml.html