lxml.html

1 # Copyright (c) 2004 Ian Bicking. All rights reserved. 2 # 3 # Redistribution and use in source and binary forms, with or without 4 # modification, are permitted provided that the following conditions are 5 # met: 6 # 7 # 1. Redistributions of source code must retain the above copyright 8 # notice, this list of conditions and the following disclaimer. 9 # 10 # 2. Redistributions in binary form must reproduce the above copyright 11 # notice, this list of conditions and the following disclaimer in 12 # the documentation and/or other materials provided with the 13 # distribution. 14 # 15 # 3. Neither the name of Ian Bicking nor the names of its contributors may 16 # be used to endorse or promote products derived from this software 17 # without specific prior written permission. 18 # 19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 23 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 """The ``lxml.html`` tool set for HTML handling. 
32 """ 33 34 from __future__ import absolute_import 35 36 __all__ = [ 37 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 38 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 39 'find_rel_links', 'find_class', 'make_links_absolute', 40 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] 41 42 43 import copy 44 import sys 45 import re 46 from functools import partial 47 48 try: 49 # while unnecessary, importing from 'collections.abc' is the right way to do it 50 from collections.abc import MutableMapping, MutableSet 51 except ImportError: 52 from collections import MutableMapping, MutableSet 53 54 from .. import etree 55 from . import defs 56 from ._setmixin import SetMixin 57 58 try: 59 from urlparse import urljoin 60 except ImportError: 61 # Python 3 62 from urllib.parse import urljoin 63 64 try: 65 unicode 66 except NameError: 67 # Python 3 68 unicode = str 69 try: 70 basestring 71 except NameError: 72 # Python 3 73 basestring = (str, bytes)

def __fix_docstring(s):
    """Strip the version-specific string prefix from doctest output lines.

    On Python 3 a ``u'`` that opens a line (after optional whitespace) is
    rewritten to ``'``; on Python 2 the same is done for ``b'``.  A falsy
    argument (``None`` or an empty string) is returned unchanged.
    """
    if not s:
        return s
    # Only the prefix letter differs between the two interpreter lines.
    prefix = 'u' if sys.version_info[0] >= 3 else 'b'
    pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
    return pattern.sub(r"\1'", s)

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both the plain HTML tag
# and its XHTML-namespaced counterpart (bound to the 'x' prefix).
_rel_links_xpath = etree.XPath(
    "descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
    namespaces={'x': XHTML_NAMESPACE})
_options_xpath = etree.XPath(
    "descendant-or-self::option|descendant-or-self::x:option",
    namespaces={'x': XHTML_NAMESPACE})
_forms_xpath = etree.XPath(
    "descendant-or-self::form|descendant-or-self::x:form",
    namespaces={'x': XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Whole-word match of $class_name inside the space-separated @class value.
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")

# CSS ``url(...)`` references; group 1 is the (possibly quoted) URL.  The
# parentheses around the group must be escaped so that they match the literal
# "(" and ")" of the CSS syntax.
_iter_css_urls = re.compile(
    r'url\((' + '["][^"]*["]|' + "['][^']*[']|" + r'[^)]*)\)', re.I).finditer
# CSS ``@import "..."`` statements; group 1 is the imported URL.
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x': XHTML_NAMESPACE})
# One space-separated entry of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')
# Extracts the URL part of a <meta http-equiv="refresh" content="N; url=...">.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``.

    Returns ``(text, position)``.  When ``s`` both starts and ends with the
    same quote character (``"`` or ``'``), the quotes are removed and the
    position is advanced by one to skip the opening quote; otherwise the
    arguments are returned unchanged.
    """
    opening, closing = s[:1], s[-1:]
    if opening == closing and opening in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos

112

def _transform_result(typ, result):
    """Convert the result back into the input type.

    ``typ`` is the type of the original input: byte strings are serialised
    as UTF-8, unicode strings as unicode text, and for any other type the
    result is handed back untouched.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = 'unicode'
    else:
        return result
    return tostring(result, encoding=encoding)

123

def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace.

    Non-string tags (such as those of comments or processing instructions)
    and tags in any other namespace are returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag

130

class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.
    Usage::

        >>> el = fromstring('<p class="hidden large">Text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """

    def __init__(self, attributes):
        # 'attributes' is the mapping that holds the 'class' entry; every
        # operation reads and writes it directly, nothing is cached here.
        self._attributes = attributes
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_name(value):
        """Reject empty class names and names that contain whitespace."""
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        """
        self._check_name(value)
        names = self._get_class_value().split()
        if value not in names:
            names.append(value)
            self._attributes['class'] = ' '.join(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing.
        """
        self._check_name(value)
        remaining = [name for name in self._get_class_value().split()
                     if name != value]
        if remaining:
            self._attributes['class'] = ' '.join(remaining)
        elif 'class' in self._attributes:
            # Nothing left: drop the attribute instead of leaving it empty.
            del self._attributes['class']

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        """
        self._check_name(value)
        super(Classes, self).remove(value)

    def __contains__(self, name):
        raw_value = self._get_class_value()
        # The substring test is a cheap pre-check before splitting.
        return name in raw_value and name in raw_value.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.
        """
        names = self._get_class_value().split()
        count_before = len(names)
        for value in values:
            if value not in names:
                names.append(value)
        # Only touch the attribute if something was actually added.
        if len(names) != count_before:
            self._attributes['class'] = ' '.join(names)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        """
        self._check_name(value)
        names = self._get_class_value().split()
        enabled = value not in names
        if enabled:
            names.append(value)
        else:
            names.remove(value)
        if names:
            self._attributes['class'] = ' '.join(names)
        else:
            del self._attributes['class']
        return enabled

238

class HtmlMixin(object):
    """HTML-specific convenience API mixed into the lxml.html node classes."""

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        value = classes._get_class_value()
        if value:
            self.set('class', value)
        elif self.get('class') is not None:
            del self.attrib['class']

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x': XHTML_NAMESPACE})[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x': XHTML_NAMESPACE})[0]

    @property
    def label(self):
        """
        Get or set any <label> element associated with this element.
        """
        el_id = self.get('id')
        if not el_id:
            return None
        result = _label_xpath(self, id=el_id)
        if not result:
            return None
        return result[0]

    @label.setter
    def label(self, label):
        el_id = self.get('id')
        if not el_id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', el_id)

    @label.deleter
    def label(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail now follows the last child that gets moved up.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element by its children, in place.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.

        Raises TypeError if no base URL is available and ValueError for an
        unknown ``handle_failures`` value.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")

        # Validate before touching the document, so that a bad argument
        # does not leave the tree half-modified.
        if handle_failures not in (None, 'ignore', 'discard'):
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        if resolve_base_href:
            # Apply the same failure policy while resolving <base href>.
            self.resolve_base_href(handle_failures=handle_failures)

        if handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            keep_on_failure = (handle_failures == 'ignore')

            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    # 'ignore' keeps the original link, 'discard' removes it.
                    return href if keep_on_failure else None

        # <base href> has already been dealt with above (or is to be ignored
        # on request), so rewrite_links() must not resolve it again.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # All <base href> tags are removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in ('classid', 'data'):
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'meta':
                http_equiv = attribs.get('http-equiv', '').lower()
                if http_equiv == 'refresh':
                    content = attribs.get('content', '')
                    match = _parse_meta_refresh_url(content)
                    url = (match.group('url') if match else content).strip()
                    # unexpected content means the redirect won't work, but we might
                    # as well be permissive and return the entire string.
                    if url:
                        url, pos = _unquote_match(
                            url, match.start('url') if match else content.find(url))
                        yield (el, 'content', url, pos)
            elif tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            elif tag == 'style' and el.text:
                urls = [
                    # (start_pos, url)
                    _unquote_match(match.group(1), match.start(1))[::-1]
                    for match in _iter_css_urls(el.text)
                ] + [
                    (match.start(1), match.group(1))
                    for match in _iter_css_imports(el.text)
                ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    # and reverse the list to report correct positions despite
                    # modifications
                    urls.sort(reverse=True)
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_iter_css_urls(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # The link lives inside the element text (e.g. a <style> tag).
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)

634

class _MethodFunc(object):
    """
    Callable that exposes an element method as a module-level function.

    The function accepts either an element or an HTML string as its first
    argument.  It returns whatever the method returns; when the method
    works in-place (and therefore returns None), the resulting document is
    returned instead, converted back to the type of the input.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Present the wrapped method's documentation as our own.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            # A string is always parsed into a fresh tree, so there is
            # nothing to copy.
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        elif kw.pop('copy', self.copy):
            doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # In-place operation: hand back the (modified) input.
        return _transform_result(result_type, doc)

# Function-style access to the HtmlMixin methods.

# Read-only helpers operate directly on the element that is passed in.
find_rel_links = _MethodFunc('find_rel_links')
find_class = _MethodFunc('find_class')
iterlinks = _MethodFunc('iterlinks')

# Link rewriting helpers work on a copy of the element by default.
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also offers the HtmlMixin API."""

682

class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also offers the HtmlMixin API."""

    # etree.ElementBase comes first in the MRO, so its cssselect() would
    # win; bind the HtmlMixin version explicitly instead.
    cssselect = HtmlMixin.cssselect

687

class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing instruction node class that also offers the HtmlMixin API."""

691

class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also offers the HtmlMixin API."""

695

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would only yield its keys, so
            # take the items of a mapping and use anything else as pairs.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    # '*' applies the mixin to every known element class.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins go first so that they take precedence in the MRO.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class for the given node, or None to fall back to
        the normal lookup."""
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

################################################################################
# parsing
################################################################################

# Matchers that tell whether markup starts (after optional whitespace) with
# an <html> tag or a doctype declaration, i.e. whether it looks like a
# complete document rather than a fragment.  One variant per string type.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.IGNORECASE).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.IGNORECASE).match

def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
    """
    Parse ``html`` as a complete document and return its root element.

    Uses the module's ``html_parser`` unless ``parser`` is given; extra
    keyword arguments are passed on to ``etree.fromstring()``.  Raises
    ``etree.ParserError`` if parsing yields no document.  If
    ``ensure_head_body`` is true, a missing ``head`` and/or ``body``
    child is added to the root.
    """
    if parser is None:
        parser = html_parser
    value = etree.fromstring(html, parser, **kw)
    if value is None:
        raise etree.ParserError(
            "Document is empty")
    if ensure_head_body:
        if value.find('head') is None:
            value.insert(0, Element('head'))
        if value.find('body') is None:
            value.append(Element('body'))
    return value

761

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    # Wrap a fragment into a minimal document, staying in the input's
    # string type.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            html = b'<html><body>' + html + b'</body></html>'
    elif not _looks_like_full_html_unicode(html):
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    has_leading_text = bool(body.text and body.text.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    elements = [body.text] if has_leading_text else []
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements

801

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If ``create_parent`` is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In this
    case, leading or trailing text is also allowed, as are multiple elements
    as result of the parsing.

    Passing a ``base_url`` will set the document's ``base_url`` attribute
    (and the tree's docinfo.URL).
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a parent element can hold it.
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not bool(create_parent),
        base_url=base_url, **kw)

    if create_parent:
        parent_tag = create_parent if isinstance(create_parent, basestring) else 'div'
        new_root = Element(parent_tag)
        if elements and isinstance(elements[0], basestring):
            # A leading string becomes the text of the new parent.
            new_root.text = elements.pop(0)
        new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    el.tail = None
    return el

850

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        looks_like_full_html = _looks_like_full_html_bytes
    else:
        looks_like_full_html = _looks_like_full_html_unicode
    is_full_html = looks_like_full_html(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = None
    if bodies:
        body = bodies[0]
        # Somehow there may be multiple bodies, which is bad, but just
        # smash them into one body
        for other_body in bodies[1:]:
            if other_body.text:
                if len(body):
                    body[-1].tail = (body[-1].tail or '') + other_body.text
                else:
                    body.text = (body.text or '') + other_body.text
            body.extend(other_body)
            # We'll ignore tail
            # I guess we are ignoring attributes too
            other_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc

    if body is None:
        return doc

    if len(body) == 1:
        only_child = body[0]
        has_text = body.text and body.text.strip()
        has_tail = only_child.tail and only_child.tail.strip()
        if not has_text and not has_tail:
            # The body has just one element, so it was probably a single
            # element passed in
            return only_child

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body

918

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    active_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, active_parser, base_url=base_url, **kw)

932

def _contains_block_level_tag(el):
    """Tell whether ``el`` or any element below it has a tag listed in
    ``defs.block_tags``."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(_nons(node.tag) in defs.block_tags
               for node in el.iter(etree.Element))

941

def _element_name(el):
    """Return a short name for a parse result item, for error messages:
    'comment', 'string', or the tag name without XHTML namespace."""
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

950

################################################################################
# form handling
################################################################################

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    @property
    def inputs(self):
        """
        Accessor object for all the input elements of this form.

        See `InputGetter` for details about the returned object.
        """
        return InputGetter(self)

    @property
    def fields(self):
        """
        Dictionary-like view on all the fields of this form.  Assigning
        to an item of the view changes the value of that form field.
        """
        return FieldsDict(self.inputs)

    @fields.setter
    def fields(self, value):
        current = self.fields
        # Names that are not mentioned in ``value`` get reset below.
        untouched = current.keys()
        for name, new_value in value.items():
            if name in untouched:
                untouched.remove(name)
            current[name] = new_value
        for name in untouched:
            # Case of an unnamed input; these aren't really
            # expressed in form_values() anyway.
            if name is not None:
                current[name] = None

    def _name(self):
        # Label used in reprs: the form's name, else '#' + id, else the
        # position of the form among all forms below the body.
        form_name = self.get('name')
        if form_name:
            return form_name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        iter_tags = self.body.iter
        all_forms = list(iter_tags('form'))
        if not all_forms:
            all_forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
        return str(all_forms.index(self))

    def form_values(self):
        """
        Collect the current field values as a list of ``(name, value)``
        tuples, suitable to be passed to ``urllib.urlencode()``.

        Unnamed fields, unchecked checkboxes/radios, submit/image/reset
        inputs and fields whose value is None are left out.
        """
        pairs = []
        for field in self.inputs:
            field_name = field.name
            if not field_name:
                continue
            tag = _nons(field.tag)
            if tag == 'textarea':
                pairs.append((field_name, field.value))
                continue
            if tag == 'select':
                selected = field.value
                if field.multiple:
                    pairs.extend((field_name, item) for item in selected)
                elif selected is not None:
                    pairs.append((field_name, field.value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % field)
            if field.checkable and not field.checked:
                continue
            if field.type in ('submit', 'image', 'reset'):
                continue
            if field.value is not None:
                pairs.append((field_name, field.value))
        return pairs

    @property
    def action(self):
        """
        Get/set the form's ``action`` attribute.

        On reading, the attribute is joined with the element's base URL
        when both are available.
        """
        base = self.base_url
        target = self.get('action')
        if not base or target is None:
            return target
        return urljoin(base, target)

    @action.setter
    def action(self, value):
        self.set('action', value)

    @action.deleter
    def action(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']

    @property
    def method(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        declared = self.get('method', 'GET')
        return declared.upper()

    @method.setter
    def method(self, value):
        self.set('method', value.upper())


HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  With the default opener this
    returns a file-like object, as from ``urllib.urlopen()``.  That
    object also has a ``.geturl()`` function, which shows the URL if
    there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    pairs; its entries are appended to the values taken from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the
    following signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is
    the target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    payload = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_pairs = extra_values.items()
        else:
            extra_pairs = extra_values
        payload.extend(extra_pairs)
    opener = open_http_urllib if open_http is None else open_http
    # Fall back to the document's own URL when the form has no action.
    target_url = form.action or form.base_url
    return opener(form.method, target_url, payload)

1111

def open_http_urllib(method, url, values):
    """
    Default HTTP opener for ``submit_form()``, based on urllib.

    For ``'GET'`` the url-encoded ``values`` are appended to the query
    string of ``url``; for any other method they are sent as the request
    body.  Returns whatever ``urlopen()`` returns.

    Raises ValueError if ``url`` is empty or None.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError:  # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        url += '&' if '?' in url else '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        # urlencode() returns text on Python 3, but urlopen() only
        # accepts bytes as request body there.
        if not isinstance(data, bytes):
            data = data.encode('ASCII')
    return urlopen(url, data)

1132

class FieldsDict(MutableMapping):
    """
    Mapping view on the values of a form's input fields.

    ``inputs`` is the accessor the view is built on; it must support
    lookup by field name, ``keys()`` and ``in``.  Reading an item
    returns the ``.value`` of the looked-up field, assigning an item
    sets it.  Items cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the keys rather than calling len() on the accessor: the
        # accessor is not required to implement __len__, and this keeps
        # the length consistent with what __iter__ yields.
        return len(self.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())

1158

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    Fields are looked up by name with ``form.inputs['field_name']``.
    Several checkboxes sharing one name come back as a list-like
    `CheckboxGroup` (which also allows value setting); several radio
    inputs sharing one name come back as a `RadioGroup`.

    Iterating over the accessor yields every input element.  This is
    not the same as looking up all the names, because checkboxes and
    radio elements are yielded individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        first = matches[0]
        input_type = first.get('type')
        if len(matches) > 1:
            # Only radios and checkboxes are grouped; the type of the
            # first match decides which kind of group is built.
            group = None
            if input_type == 'radio':
                group = RadioGroup(matches)
            elif input_type == 'checkbox':
                group = CheckboxGroup(matches)
            if group is not None:
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        # Distinct field names; unnamed inputs (name None) are left out.
        names = set(el.name for el in self)
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        all_inputs = self._all_xpath(self.form)
        return iter(all_inputs)

1225

class InputMixin(object):
    """
    Mix-in for all input elements (input, select, and textarea)
    """

    @property
    def name(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    @name.setter
    def name(self, value):
        self.set('name', value)

    @name.deleter
    def name(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    def __repr__(self):
        # Not every class using this mix-in has a ``type``, so the type
        # part is only shown when one is available and non-empty.
        element_type = getattr(self, 'type', None)
        type_part = ' type=%r' % element_type if element_type else ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)

1256

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name``; the
    value can be read and written through ``.value``.
    """

    @property
    def value(self):
        """
        Get/set the value (which is the contents of this element)
        """
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        # Child elements are rare inside a textarea; when present they
        # are serialised and appended after the leading text.
        pieces = [self.text or '']
        pieces.extend(
            etree.tostring(child, method=serialisation_method, encoding='unicode')
            for child in self)
        return ''.join(pieces)

    @value.setter
    def value(self, value):
        del self[:]
        self.text = value

    @value.deleter
    def value(self):
        self.text = ''
        del self[:]


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is
    a set-like object.  In either case ``.value_options`` gives the
    possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    @staticmethod
    def _option_value(option):
        # The ``value`` attribute if there is one, else the option's
        # text; non-empty results are stripped of surrounding whitespace.
        raw = option.get('value')
        if raw is None:
            raw = option.text or ''
        return raw.strip() if raw else raw

    @property
    def value(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is not None:
                return self._option_value(option)
        return None

    @value.setter
    def value(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError("You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        chosen = None
        if value is not None:
            wanted = value.strip()
            for option in _options_xpath(self):
                if self._option_value(option) == wanted:
                    chosen = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % wanted)
        # Unselect everything first, then mark the requested option.
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    @value.deleter
    def value(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    @property
    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or their text
        where the attribute is missing).
        """
        return [self._option_value(option) for option in _options_xpath(self)]

    @property
    def multiple(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    @multiple.setter
    def multiple(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']


HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    Adding a value to this set-like object selects the matching
    option, removing a value unselects it.
    """

    def __init__(self, select):
        self.select = select

    @staticmethod
    def _value_of(option):
        # The ``value`` attribute if there is one, else the option's
        # text; non-empty results are stripped of surrounding whitespace.
        raw = option.get('value')
        if raw is None:
            raw = option.text or ''
        return raw.strip() if raw else raw

    @property
    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    def __iter__(self):
        for option in self.options:
            if 'selected' not in option.attrib:
                continue
            yield self._value_of(option)

    def add(self, item):
        for option in self.options:
            if self._value_of(option) == item:
                option.set('selected', '')
                return
        raise ValueError(
            "There is no option with the value %r" % item)

    def remove(self, item):
        for option in self.options:
            if self._value_of(option) != item:
                continue
            if 'selected' not in option.attrib:
                raise ValueError(
                    "The option %r is not currently selected" % item)
            del option.attrib['selected']
            return
        raise ValueError(
            "There is not option with the value %r" % item)

    def __repr__(self):
        selected = ', '.join([repr(v) for v in self])
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            selected,
            self.select.name)

1462

class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    It behaves like a list of those elements.  In addition, the
    ``.value`` property checks/unchecks inputs and ``.value_options``
    lists the possible values.
    """

    @property
    def value(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        checked_values = (
            radio.get('value') for radio in self if 'checked' in radio.attrib)
        return next(checked_values, None)

    @value.setter
    def value(self, value):
        target = None
        if value is not None:
            # Find the radio to check before anything is modified, so
            # that an unknown value leaves the group unchanged.
            matching = [radio for radio in self if radio.get('value') == value]
            if not matching:
                raise ValueError("There is no radio input with the value %r" % value)
            target = matching[0]
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    @value.deleter
    def value(self):
        self.value = None

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [radio.get('value') for radio in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__,
            list.__repr__(self))

1515

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    @property
    def value(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values replaces the current
        selection.  Assigning something that is not iterable raises
        ValueError and leaves the checkboxes untouched.
        """
        return CheckboxValues(self)

    @value.setter
    def value(self, value):
        # Validate before clearing, so that a rejected assignment does
        # not wipe the current selection as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    @value.deleter
    def value(self):
        self.value.clear()

    @property
    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1559

class CheckboxValues(SetMixin):
    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Snapshot the checked values up front, so that callers may
        # check/uncheck boxes while iterating.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        checked = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            checked,
            self.group.name)

1602

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The type is available as ``.type`` (lower-cased, defaulting to
    ``'text'``), and the value can be read and written as ``.value``.

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    @property
    def value(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a (non-empty) value reports
        ``'on'``.  A checkbox or radio that is not checked reports None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    @value.setter
    def value(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truth of the new value drives the
        # checked state; only string values are stored as attribute.
        self.checked = bool(value)
        if value and isinstance(value, basestring):
            self.set('value', value)

    @value.deleter
    def value(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    @property
    def type(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    @type.setter
    def type(self, value):
        self.set('type', value)

    @property
    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    @property
    def checked(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types; on any other
        type, reading or writing it raises AttributeError.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    @checked.setter
    def checked(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']


HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    @property
    def for_element(self):
        """
        Get/set/delete the element this label points to.

        Reading returns None when the label has no (non-empty) ``for``
        attribute; otherwise the lookup is delegated to
        ``self.body.get_element_by_id()``.  Assigning an element stores
        its ``id`` in the ``for`` attribute and raises TypeError if the
        element has no id.  Deleting removes the ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    @for_element.setter
    def for_element(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    @for_element.deleter
    def for_element(self):
        # The link is stored in ``for`` (see getter and setter), so that
        # is the attribute to drop; the label's own ``id`` is unrelated.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']


HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    Accepts either an element or a tree object providing ``getroot()``.
    Tags that already carry a namespace are left alone.  The tree is
    modified in place.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # Not a tree object, so assume an element was passed in.
        root = html
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter(etree.Element):
        plain_tag = node.tag
        if plain_tag[0] != '{':
            node.tag = xhtml_prefix + plain_tag

1754

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    Accepts either an element or a tree object providing ``getroot()``.
    The tree is modified in place.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # Not a tree object, so assume an element was passed in.
        root = xhtml
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    strip_count = len(xhtml_prefix)
    for node in root.iter(xhtml_prefix + "*"):
        node.tag = node.tag[strip_count:]

# Substitution helpers that drop a serialised Content-Type <meta> tag,
# one for text output and one for byte string output.
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__meta_content_type_pattern = r'<meta http-equiv="Content-Type"[^>]*>'
__str_replace_meta_content_type = re.compile(
    __meta_content_type_pattern).sub
__bytes_replace_meta_content_type = re.compile(
    __meta_content_type_pattern.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail, doctype=doctype)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or an element tree.  The file is written
    with ``encoding`` if given, else with the document's own encoding,
    else as UTF-8.  The file URL is also printed.
    """
    import os
    import webbrowser
    import tempfile
    tree = doc if isinstance(doc, etree._ElementTree) else etree.ElementTree(doc)
    handle, path = tempfile.mkstemp(suffix='.html')
    # we leak the file itself here, but we should at least close it
    with os.fdopen(handle, 'wb') as out:
        tree.write(out, method="html", encoding=encoding or tree.docinfo.encoding or "UTF-8")
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)

1876

################################################################################
# configure Element class lookup
################################################################################

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        # Make the parser build elements through HtmlElementClassLookup.
        element_lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(element_lookup)

1889

class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        # Make the parser build elements through HtmlElementClassLookup.
        element_lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(element_lookup)

1911

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.

    All arguments are passed on to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)

# Shared module-level parser instances.  ``html_parser`` is the default
# used by ``parse()`` and ``Element()`` when no parser is passed in.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Source Code for Package lxml.html