lxml.html

1 # Copyright (c) 2004 Ian Bicking. All rights reserved. 2 # 3 # Redistribution and use in source and binary forms, with or without 4 # modification, are permitted provided that the following conditions are 5 # met: 6 # 7 # 1. Redistributions of source code must retain the above copyright 8 # notice, this list of conditions and the following disclaimer. 9 # 10 # 2. Redistributions in binary form must reproduce the above copyright 11 # notice, this list of conditions and the following disclaimer in 12 # the documentation and/or other materials provided with the 13 # distribution. 14 # 15 # 3. Neither the name of Ian Bicking nor the names of its contributors may 16 # be used to endorse or promote products derived from this software 17 # without specific prior written permission. 18 # 19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 23 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 """The ``lxml.html`` tool set for HTML handling. 
32 """ 33 34 import sys 35 import re 36 try: 37 from urlparse import urljoin 38 except ImportError: 39 # Python 3 40 from urllib.parse import urljoin 41 import copy 42 from lxml import etree 43 from lxml.html import defs 44 from lxml.html._setmixin import SetMixin 45 try: 46 from collections import MutableMapping as DictMixin 47 except ImportError: 48 # Python < 2.6 49 from UserDict import DictMixin 50 try: 51 set 52 except NameError: 53 # Python 2.3 54 from sets import Set as set 55 try: 56 bytes 57 except NameError: 58 # Python < 2.6 59 bytes = str 60 try: 61 unicode 62 except NameError: 63 # Python 3 64 unicode = str 65 try: 66 basestring 67 except NameError: 68 # Python 3 69 basestring = (str, bytes) 70

def __fix_docstring(s):
    """Normalise string-literal prefixes in doctest output of a docstring.

    On Python 3, a ``u'`` that starts a line (after optional whitespace) is
    rewritten to ``'``; on Python 2 the same is done for ``b'``.  A falsy
    argument (``None`` or the empty string) is returned unchanged.
    """
    if not s:
        return s
    if sys.version_info[0] >= 3:
        prefix_re = re.compile(r"^(\s*)u'", re.M)
    else:
        prefix_re = re.compile(r"^(\s*)b'", re.M)
    # keep the captured indentation, drop only the prefix letter
    return prefix_re.sub(r"\1'", s)

__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions; each matches both plain HTML elements and
# their XHTML-namespaced counterparts (prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Matches CSS ``url(...)``; group 1 is the (possibly quoted) URL.  The
# literal parentheses must be escaped, otherwise the pattern cannot match.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Splits the space-separated URL list of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from a matched CSS URL.

    ``s`` is the matched text and ``pos`` its start offset.  If ``s`` begins
    and ends with the same quote character (``"`` or ``'``) the quotes are
    removed and the offset is advanced by one, so that it points at the
    unquoted text.  Otherwise both values are returned unchanged.
    """
    quote = s[:1]
    if quote in ('"', "'") and s[-1:] == quote:
        return s[1:-1], pos + 1
    return s, pos

110

def _transform_result(typ, result):
    """Convert the result back into the input type.

    If ``typ`` is a byte string type, ``result`` is serialised with
    ``tostring()`` as UTF-8; if it is a unicode string type, it is serialised
    to a unicode string.  For any other type ``result`` is returned as is.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    if issubclass(typ, unicode):
        return tostring(result, encoding='unicode')
    return result

120

def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace.

    String tags of the form ``{http://www.w3.org/1999/xhtml}name`` are
    reduced to ``name``.  Anything else (other namespaces, non-string tags
    such as those of comments) is returned unchanged.
    """
    if isinstance(tag, basestring):
        # slicing instead of indexing so that an empty string cannot raise
        if tag[:1] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:
            return tag.split('}')[-1]
    return tag

126

class HtmlMixin(object):
    """
    Mixin with HTML-specific helpers for element classes: shortcut
    properties, tree editing methods and link inspection/rewriting.
    """

    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
    head = property(head, doc=head.__doc__)

    def _label__get(self):
        """
        Get or set any <label> element associated with this element.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        return result[0]

    def _label__set(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    def _label__del(self):
        label = self.label
        if label is not None:
            del label.attrib['for']

    label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding='unicode'))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # the tail follows the last child once the tag is gone
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        # replace this element by its children in the parent
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to
        ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True,
                            handle_failures=None):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()

        if handle_failures == 'ignore':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return href
        elif handle_failures == 'discard':
            def link_repl(href):
                try:
                    return urljoin(base_url, href)
                except ValueError:
                    return None
        elif handle_failures is None:
            def link_repl(href):
                return urljoin(base_url, href)
        else:
            raise ValueError(
                "unexpected value for handle_failures: %r" % handle_failures)

        self.rewrite_links(link_repl)

    def resolve_base_href(self, handle_failures=None):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If ``handle_failures`` is None (default), a failure to process
        a URL will abort the processing.  If set to 'ignore', errors
        are ignored.  If set to 'discard', failing URLs will be removed.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]',
                              namespaces={'x': XHTML_NAMESPACE})
        # every <base href> tag is removed; the href of the last one wins
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False,
                                 handle_failures=handle_failures)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter(etree.Element):
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in 'classid', 'data':
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            if tag == 'style' and el.text:
                urls = [
                    _unquote_match(match.group(1), match.start(1))
                    for match in _css_url_re.finditer(el.text)
                    ] + [
                    (match.group(1), match.start(1))
                    for match in _css_import_re.finditer(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    urls = [(start, url) for (url, start) in urls]
                    urls.sort()
                    # reverse the list to report correct positions despite
                    # modifications
                    urls.reverse()
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_css_url_re.finditer(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(
                base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()

        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue

            if attrib is None:
                # link inside the element text (e.g. a <style> body)
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if not pos and len(cur) == len(link):
                    new = new_link  # most common case
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                el.set(attrib, new)
490 491

492 -class _MethodFunc(object):

493 """ 494 An object that represents a method on an element as a function; 495 the function takes either an element or an HTML string. It 496 returns whatever the function normally returns, or if the function 497 works in-place (and so returns None) it returns a serialized form 498 of the resulting document. 499 """

500 - def __init__(self, name, copy=False, source_class=HtmlMixin):

501 self.name = name 502 self.copy = copy 503 self.__doc__ = getattr(source_class, self.name).__doc__

504 - def __call__(self, doc, *args, **kw):

505 result_type = type(doc) 506 if isinstance(doc, basestring): 507 if 'copy' in kw: 508 raise TypeError( 509 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name) 510 doc = fromstring(doc, **kw) 511 else: 512 if 'copy' in kw: 513 make_a_copy = kw.pop('copy') 514 else: 515 make_a_copy = self.copy 516 if make_a_copy: 517 doc = copy.deepcopy(doc) 518 meth = getattr(doc, self.name) 519 result = meth(*args, **kw) 520 # FIXME: this None test is a bit sloppy 521 if result is None: 522 # Then return what we got in 523 return _transform_result(result_type, doc) 524 else: 525 return result

526 527 find_rel_links = _MethodFunc('find_rel_links', copy=False) 528 find_class = _MethodFunc('find_class', copy=False) 529 make_links_absolute = _MethodFunc('make_links_absolute', copy=True) 530 resolve_base_href = _MethodFunc('resolve_base_href', copy=True) 531 iterlinks = _MethodFunc('iterlinks', copy=False) 532 rewrite_links = _MethodFunc('rewrite_links', copy=True) 533

534 -class HtmlComment(etree.CommentBase, HtmlMixin):

535 pass

536

537 -class HtmlElement(etree.ElementBase, HtmlMixin):

538 pass

539

540 -class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):

541 pass

542

543 -class HtmlEntity(etree.EntityBase, HtmlMixin):

544 pass

545 546

547 -class HtmlElementClassLookup(etree.CustomElementClassLookup):

548 """A lookup scheme for HTML Element classes. 549 550 To create a lookup instance with different Element classes, pass a tag 551 name mapping of Element classes in the ``classes`` keyword argument and/or 552 a tag name mapping of Mixin classes in the ``mixins`` keyword argument. 553 The special key '*' denotes a Mixin class that should be mixed into all 554 Element classes. 555 """ 556 _default_element_classes = {} 557

558 - def __init__(self, classes=None, mixins=None):

559 etree.CustomElementClassLookup.__init__(self) 560 if classes is None: 561 classes = self._default_element_classes.copy() 562 if mixins: 563 mixers = {} 564 for name, value in mixins: 565 if name == '*': 566 for n in classes.keys(): 567 mixers.setdefault(n, []).append(value) 568 else: 569 mixers.setdefault(name, []).append(value) 570 for name, mix_bases in mixers.items(): 571 cur = classes.get(name, HtmlElement) 572 bases = tuple(mix_bases + [cur]) 573 classes[name] = type(cur.__name__, bases, {}) 574 self._element_classes = classes

575

576 - def lookup(self, node_type, document, namespace, name):

577 if node_type == 'element': 578 return self._element_classes.get(name.lower(), HtmlElement) 579 elif node_type == 'comment': 580 return HtmlComment 581 elif node_type == 'PI': 582 return HtmlProcessingInstruction 583 elif node_type == 'entity': 584 return HtmlEntity 585 # Otherwise normal lookup 586 return None

587 588 ################################################################################ 589 # parsing 590 ################################################################################ 591 592 _looks_like_full_html_unicode = re.compile( 593 unicode(r'^\s*<(?:html|!doctype)'), re.I).match 594 _looks_like_full_html_bytes = re.compile( 595 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match 596

597 -def document_fromstring(html, parser=None, **kw):

598 if parser is None: 599 parser = html_parser 600 value = etree.fromstring(html, parser, **kw) 601 if value is None: 602 raise etree.ParserError( 603 "Document is empty") 604 return value

605

606 -def fragments_fromstring(html, no_leading_text=False, base_url=None, 607 parser=None, **kw):

608 """ 609 Parses several HTML elements, returning a list of elements. 610 611 The first item in the list may be a string (though leading 612 whitespace is removed). If no_leading_text is true, then it will 613 be an error if there is leading text, and it will always be a list 614 of only elements. 615 616 base_url will set the document's base_url attribute (and the tree's docinfo.URL) 617 """ 618 if parser is None: 619 parser = html_parser 620 # FIXME: check what happens when you give html with a body, head, etc. 621 if isinstance(html, bytes): 622 if not _looks_like_full_html_bytes(html): 623 # can't use %-formatting in early Py3 versions 624 html = ('<html><body>'.encode('ascii') + html + 625 '</body></html>'.encode('ascii')) 626 else: 627 if not _looks_like_full_html_unicode(html): 628 html = '<html><body>%s</body></html>' % html 629 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) 630 assert _nons(doc.tag) == 'html' 631 bodies = [e for e in doc if _nons(e.tag) == 'body'] 632 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html)) 633 body = bodies[0] 634 elements = [] 635 if no_leading_text and body.text and body.text.strip(): 636 raise etree.ParserError( 637 "There is leading text: %r" % body.text) 638 if body.text and body.text.strip(): 639 elements.append(body.text) 640 elements.extend(body) 641 # FIXME: removing the reference to the parent artificial document 642 # would be nice 643 return elements

644

645 -def fragment_fromstring(html, create_parent=False, base_url=None, 646 parser=None, **kw):

647 """ 648 Parses a single HTML element; it is an error if there is more than 649 one element, or if anything but whitespace precedes or follows the 650 element. 651 652 If create_parent is true (or is a tag name) then a parent node 653 will be created to encapsulate the HTML in a single element. In 654 this case, leading or trailing text is allowed. 655 656 base_url will set the document's base_url attribute (and the tree's docinfo.URL) 657 """ 658 if parser is None: 659 parser = html_parser 660 661 accept_leading_text = bool(create_parent) 662 663 elements = fragments_fromstring( 664 html, parser=parser, no_leading_text=not accept_leading_text, 665 base_url=base_url, **kw) 666 667 if create_parent: 668 if not isinstance(create_parent, basestring): 669 create_parent = 'div' 670 new_root = Element(create_parent) 671 if elements: 672 if isinstance(elements[0], basestring): 673 new_root.text = elements[0] 674 del elements[0] 675 new_root.extend(elements) 676 return new_root 677 678 if not elements: 679 raise etree.ParserError('No elements found') 680 if len(elements) > 1: 681 raise etree.ParserError( 682 "Multiple elements found (%s)" 683 % ', '.join([_element_name(e) for e in elements])) 684 el = elements[0] 685 if el.tail and el.tail.strip(): 686 raise etree.ParserError( 687 "Element followed by text: %r" % el.tail) 688 el.tail = None 689 return el

690

691 -def fromstring(html, base_url=None, parser=None, **kw):

692 """ 693 Parse the html, returning a single element/document. 694 695 This tries to minimally parse the chunk of text, without knowing if it 696 is a fragment or a document. 697 698 base_url will set the document's base_url attribute (and the tree's docinfo.URL) 699 """ 700 if parser is None: 701 parser = html_parser 702 if isinstance(html, bytes): 703 is_full_html = _looks_like_full_html_bytes(html) 704 else: 705 is_full_html = _looks_like_full_html_unicode(html) 706 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw) 707 if is_full_html: 708 return doc 709 # otherwise, lets parse it out... 710 bodies = doc.findall('body') 711 if not bodies: 712 bodies = doc.findall('{%s}body' % XHTML_NAMESPACE) 713 if bodies: 714 body = bodies[0] 715 if len(bodies) > 1: 716 # Somehow there are multiple bodies, which is bad, but just 717 # smash them into one body 718 for other_body in bodies[1:]: 719 if other_body.text: 720 if len(body): 721 body[-1].tail = (body[-1].tail or '') + other_body.text 722 else: 723 body.text = (body.text or '') + other_body.text 724 body.extend(other_body) 725 # We'll ignore tail 726 # I guess we are ignoring attributes too 727 other_body.drop_tree() 728 else: 729 body = None 730 heads = doc.findall('head') 731 if not heads: 732 heads = doc.findall('{%s}head' % XHTML_NAMESPACE) 733 if heads: 734 # Well, we have some sort of structure, so lets keep it all 735 head = heads[0] 736 if len(heads) > 1: 737 for other_head in heads[1:]: 738 head.extend(other_head) 739 # We don't care about text or tail in a head 740 other_head.drop_tree() 741 return doc 742 if body is None: 743 return doc 744 if (len(body) == 1 and (not body.text or not body.text.strip()) 745 and (not body[-1].tail or not body[-1].tail.strip())): 746 # The body has just one element, so it was probably a single 747 # element passed in 748 return body[0] 749 # Now we have a body which represents a bunch of tags which have the 750 # content that was passed in. 
We will create a fake container, which 751 # is the body tag, except <body> implies too much structure. 752 if _contains_block_level_tag(body): 753 body.tag = 'div' 754 else: 755 body.tag = 'span' 756 return body

757

758 -def parse(filename_or_url, parser=None, base_url=None, **kw):

759 """ 760 Parse a filename, URL, or file-like object into an HTML document 761 tree. Note: this returns a tree, not an element. Use 762 ``parse(...).getroot()`` to get the document root. 763 764 You can override the base URL with the ``base_url`` keyword. This 765 is most useful when parsing from a file-like object. 766 """ 767 if parser is None: 768 parser = html_parser 769 return etree.parse(filename_or_url, parser, base_url=base_url, **kw)

770

771 -def _contains_block_level_tag(el):

772 # FIXME: I could do this with XPath, but would that just be 773 # unnecessarily slow? 774 for el in el.iter(etree.Element): 775 if _nons(el.tag) in defs.block_tags: 776 return True 777 return False

778

779 -def _element_name(el):

780 if isinstance(el, etree.CommentBase): 781 return 'comment' 782 elif isinstance(el, basestring): 783 return 'string' 784 else: 785 return _nons(el.tag)

786 787 ################################################################################ 788 # form handling 789 ################################################################################ 790

791 -class FormElement(HtmlElement):

792 """ 793 Represents a <form> element. 794 """ 795

796 - def inputs(self):

797 """ 798 Returns an accessor for all the input elements in the form. 799 800 See `InputGetter` for more information about the object. 801 """ 802 return InputGetter(self)

803 inputs = property(inputs, doc=inputs.__doc__) 804

805 - def _fields__get(self):

806 """ 807 Dictionary-like object that represents all the fields in this 808 form. You can set values in this dictionary to effect the 809 form. 810 """ 811 return FieldsDict(self.inputs)

812 - def _fields__set(self, value):

813 prev_keys = self.fields.keys() 814 for key, value in value.items(): 815 if key in prev_keys: 816 prev_keys.remove(key) 817 self.fields[key] = value 818 for key in prev_keys: 819 if key is None: 820 # Case of an unnamed input; these aren't really 821 # expressed in form_values() anyway. 822 continue 823 self.fields[key] = None

824 825 fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__) 826

827 - def _name(self):

828 if self.get('name'): 829 return self.get('name') 830 elif self.get('id'): 831 return '#' + self.get('id') 832 forms = list(self.body.iter('form')) 833 if not forms: 834 forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE)) 835 return str(forms.index(self))

836

837 - def form_values(self):

838 """ 839 Return a list of tuples of the field values for the form. 840 This is suitable to be passed to ``urllib.urlencode()``. 841 """ 842 results = [] 843 for el in self.inputs: 844 name = el.name 845 if not name: 846 continue 847 tag = _nons(el.tag) 848 if tag == 'textarea': 849 results.append((name, el.value)) 850 elif tag == 'select': 851 value = el.value 852 if el.multiple: 853 for v in value: 854 results.append((name, v)) 855 elif value is not None: 856 results.append((name, el.value)) 857 else: 858 assert tag == 'input', ( 859 "Unexpected tag: %r" % el) 860 if el.checkable and not el.checked: 861 continue 862 if el.type in ('submit', 'image', 'reset'): 863 continue 864 value = el.value 865 if value is not None: 866 results.append((name, el.value)) 867 return results

868

869 - def _action__get(self):

870 """ 871 Get/set the form's ``action`` attribute. 872 """ 873 base_url = self.base_url 874 action = self.get('action') 875 if base_url and action is not None: 876 return urljoin(base_url, action) 877 else: 878 return action

879 - def _action__set(self, value):

880 self.set('action', value)

881 - def _action__del(self):

882 if 'action' in self.attrib: 883 del self.attrib['action']

884 action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__) 885

886 - def _method__get(self):

887 """ 888 Get/set the form's method. Always returns a capitalized 889 string, and defaults to ``'GET'`` 890 """ 891 return self.get('method', 'GET').upper()

892 - def _method__set(self, value):

893 self.set('method', value.upper())

894 method = property(_method__get, _method__set, doc=_method__get.__doc__)

895 896 HtmlElementClassLookup._default_element_classes['form'] = FormElement 897

898 -def submit_form(form, extra_values=None, open_http=None):

899 """ 900 Helper function to submit a form. Returns a file-like object, as from 901 ``urllib.urlopen()``. This object also has a ``.geturl()`` function, 902 which shows the URL if there were any redirects. 903 904 You can use this like:: 905 906 form = doc.forms[0] 907 form.inputs['foo'].value = 'bar' # etc 908 response = form.submit() 909 doc = parse(response) 910 doc.make_links_absolute(response.geturl()) 911 912 To change the HTTP requester, pass a function as ``open_http`` keyword 913 argument that opens the URL for you. The function must have the following 914 signature:: 915 916 open_http(method, URL, values) 917 918 The action is one of 'GET' or 'POST', the URL is the target URL as a 919 string, and the values are a sequence of ``(name, value)`` tuples with the 920 form data. 921 """ 922 values = form.form_values() 923 if extra_values: 924 if hasattr(extra_values, 'items'): 925 extra_values = extra_values.items() 926 values.extend(extra_values) 927 if open_http is None: 928 open_http = open_http_urllib 929 if form.action: 930 url = form.action 931 else: 932 url = form.base_url 933 return open_http(form.method, url, values)

934

935 -def open_http_urllib(method, url, values):

936 if not url: 937 raise ValueError("cannot submit, no URL provided") 938 ## FIXME: should test that it's not a relative URL or something 939 try: 940 from urllib import urlencode, urlopen 941 except ImportError: # Python 3 942 from urllib.request import urlopen 943 from urllib.parse import urlencode 944 if method == 'GET': 945 if '?' in url: 946 url += '&' 947 else: 948 url += '?' 949 url += urlencode(values) 950 data = None 951 else: 952 data = urlencode(values) 953 return urlopen(url, data)

954

class FieldsDict(DictMixin):
    """
    Dictionary-like view of field names to field *values*.

    Wraps an ``inputs`` object (anything that supports item lookup by
    field name, ``in`` tests and ``keys()``; each looked-up field must
    have a ``.value`` attribute).  Reading or assigning an item reads
    or assigns the ``.value`` of the corresponding field.  Fields
    cannot be removed through this view.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        # Deleting is deliberately unsupported; KeyError is kept as the
        # exception type so existing callers keep working.
        raise KeyError(
            "You cannot remove keys from FieldsDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the field names rather than calling len() on the wrapped
        # object: the mapping's length must match what iteration yields,
        # and the wrapped object is not required to support len().
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())

979

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    # All <select>, <input> and <textarea> descendants with a given name.
    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    # All <select>, <input> and <textarea> descendants, named or not.
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        """
        Return the field(s) named ``name``.

        Several radio inputs sharing the name are returned as a
        `RadioGroup`, several checkboxes as a `CheckboxGroup`; in every
        other case the first matching element is returned.  Raises
        ``KeyError`` if no field has that name.
        """
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        # The type of the first match decides how the matches are grouped.
        input_type = results[0].get('type')
        if input_type == 'radio' and len(results) > 1:
            group = RadioGroup(results)
            group.name = name
            return group
        elif input_type == 'checkbox' and len(results) > 1:
            group = CheckboxGroup(results)
            group.name = name
            return group
        else:
            # I don't like throwing away elements like this
            return results[0]

    def __contains__(self, name):
        results = self._name_xpath(self.form, name=name)
        return bool(results)

    def keys(self):
        """
        Return the unique field names as a list, in document order.

        Elements without a name are skipped.
        """
        names = []
        seen = set()
        for el in self:
            name = el.name
            if name is None or name in seen:
                continue
            seen.add(name)
            names.append(name)
        return names

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))

1045

class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea).

    Expects the host class to provide ``get()``, ``set()`` and
    ``attrib`` for attribute access.
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        # Deleting a name that is not set is silently accepted.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Only some host classes define ``type``; show it when present
        # and non-empty.
        input_type = getattr(self, 'type', None)
        type_part = ''
        if input_type:
            type_part = ' type=%r' % input_type
        details = (self.__class__.__name__, id(self), self.name, type_part)
        return '<%s %x name=%r%s>' % details

1073

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Serialise children the same way the element itself is
        # namespaced: XML for XHTML-namespaced tags, HTML otherwise.
        xhtml_prefix = "{%s}" % XHTML_NAMESPACE
        if self.tag.startswith(xhtml_prefix):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        parts = [self.text or '']
        # Child elements are unusual in a textarea; when present their
        # serialised markup becomes part of the value.
        for child in self:
            parts.append(etree.tostring(
                child, method=serialisation_method, encoding='unicode'))
        return ''.join(parts)

    def _value__set(self, value):
        # Drop any child elements, then store the value as plain text.
        del self[:]
        self.text = value

    def _value__del(self):
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)


# Make the HTML element class lookup return this class for <textarea>.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

def _select_option_value(option):
    """
    Return the value an ``<option>`` stands for: its ``value``
    attribute, or its text if it has no such attribute, stripped of
    surrounding whitespace.
    """
    value = option.get('value')
    if value is None:
        value = option.text or ''
    if value:
        value = value.strip()
    return value


class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for el in _options_xpath(self):
            if el.get('selected') is not None:
                # the first selected option wins
                return _select_option_value(el)
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        # ``None`` means "deselect everything"; only a real value has to
        # match one of the options.
        checked_option = None
        if value is not None:
            value = value.strip()
            for el in _options_xpath(self):
                if _select_option_value(el) == value:
                    checked_option = el
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Validation passed: clear the old selection before applying
        # the new one.
        for el in _options_xpath(self):
            if 'selected' in el.attrib:
                del el.attrib['selected']
        if checked_option is not None:
            checked_option.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or their text
        where the attribute is missing).
        """
        return [_select_option_value(el) for el in _options_xpath(self)]

    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)


# Make the HTML element class lookup return this class for <select>.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    options = property(options)

    def __iter__(self):
        # Yield the value of every option that carries ``selected``.
        for candidate in self.options:
            if 'selected' not in candidate.attrib:
                continue
            current = candidate.get('value')
            if current is None:
                current = candidate.text or ''
            if current:
                current = current.strip()
            yield current

    def add(self, item):
        # Select the first option whose value equals ``item``.
        for candidate in self.options:
            current = candidate.get('value')
            if current is None:
                current = candidate.text or ''
            if current:
                current = current.strip()
            if current == item:
                candidate.set('selected', '')
                return
        raise ValueError(
            "There is no option with the value %r" % item)

    def remove(self, item):
        # Deselect the first option whose value equals ``item``; it is
        # an error if that option is not currently selected.
        for candidate in self.options:
            current = candidate.get('value')
            if current is None:
                current = candidate.text or ''
            if current:
                current = current.strip()
            if current != item:
                continue
            if 'selected' not in candidate.attrib:
                raise ValueError(
                    "The option %r is not currently selected" % item)
            del candidate.attrib['selected']
            return
        raise ValueError(
            "There is not option with the value %r" % item)

    def __repr__(self):
        shown = ', '.join(map(repr, self))
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__, shown, self.select.name)

1268

class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        # The first checked radio determines the value.
        for radio in self:
            if 'checked' not in radio.attrib:
                continue
            return radio.get('value')
        return None

    def _value__set(self, value):
        target = None
        if value is not None:
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                # Raised before anything is unchecked, so a bad value
                # leaves the group untouched.
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio.
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        possible = []
        for radio in self:
            possible.append(radio.get('value'))
        return possible

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)

1320

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before touching any checkbox, so that a bad
        # assignment does not wipe the current state.  Strings are
        # rejected explicitly: on Python 3 they have ``__iter__`` and
        # would otherwise be treated as a sequence of one-character
        # values.
        if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def _value__del(self):
        # Deleting the value unchecks every checkbox.
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1359

class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Take a snapshot of the checked values, so the result does not
        # change while the checkboxes are being modified.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        # Check the first checkbox carrying this value.
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        # Uncheck the first checkbox carrying this value; it is an
        # error if that checkbox is not checked.
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join(map(repr, self))
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown, self.group.name)

1402

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # Checkable inputs: a false value unchecks, anything else
        # checks; only string values replace the ``value`` attribute.
        if not value:
            self.checked = False
            return
        self.checked = True
        if isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ['checkbox', 'radio']

    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)


# Make the HTML element class lookup return this class for <input>.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        try:
            return self.body.get_element_by_id(target_id)
        except KeyError:
            # NOTE(review): get_element_by_id() is defined outside this
            # chunk; presumably it reports an unknown id with KeyError
            # -- confirm.  A dangling ``for`` is mapped to None, as the
            # docstring promises.
            return None

    def _for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # Unlinking the label means dropping its ``for`` attribute; the
        # label's own ``id`` must stay untouched.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)


# Make the HTML element class lookup return this class for <label>.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    ``html`` may be an element or anything with a ``getroot()`` method.
    The tree is modified in place; tags that already carry a namespace
    are left alone.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # not a tree wrapper: use the object itself as the root
        root = html
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    for element in root.iter(etree.Element):
        name = element.tag
        if name[0] == '{':
            continue
        element.tag = namespace_prefix + name

1537

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    ``xhtml`` may be an element or anything with a ``getroot()`` method.
    The tree is modified in place; only tags in the XHTML namespace are
    touched.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # not a tree wrapper: use the object itself as the root
        root = xhtml
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    strip_from = len(namespace_prefix)
    for element in root.iter(namespace_prefix + "*"):
        element.tag = element.tag[strip_from:]

# Substitution helpers used by tostring() to strip the Content-Type
# <meta> tag from serialised HTML.  There is one bound ``.sub`` per
# result type (text and bytes), called as ``helper(replacement, html)``.
#
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: when serialising with the 'html' method, any
    ``<meta http-equiv="Content-Type" ...>`` tag is stripped from the
    output unless ``include_meta_content_type`` is true.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    if method != 'html' or include_meta_content_type:
        return serialised
    # Strip the Content-Type <meta> tag, using the substitution helper
    # that matches the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)

# Post-process the docstring with the module's docstring fixer.
tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or an ElementTree.  The file is written
    in ``encoding`` if given, otherwise in the document's own encoding,
    falling back to UTF-8.
    """
    import os
    import tempfile
    import webbrowser
    tree = doc
    if not isinstance(tree, etree._ElementTree):
        tree = etree.ElementTree(tree)
    descriptor, path = tempfile.mkstemp(suffix='.html')
    out = os.fdopen(descriptor, 'wb')
    try:
        tree.write(out, method="html",
                   encoding=encoding or tree.docinfo.encoding or "UTF-8")
    finally:
        # we leak the file itself here, but we should at least close it
        out.close()
    location = 'file://' + path.replace(os.path.sep, '/')
    print(location)
    webbrowser.open(location)

1654 1655 ################################################################################ 1656 # configure Element class lookup 1657 ################################################################################ 1658

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        # Install the lxml.html element classes on this parser.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1666

class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """

    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        # Install the lxml.html element classes on this parser.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1687

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.

    All arguments are passed on to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)

# Shared module-level parser instances, created once at import time.
# ``html_parser`` is the parser Element() uses to create new elements.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Source Code for Package lxml.html