lxml.html

1 # Copyright (c) 2004 Ian Bicking. All rights reserved. 2 # 3 # Redistribution and use in source and binary forms, with or without 4 # modification, are permitted provided that the following conditions are 5 # met: 6 # 7 # 1. Redistributions of source code must retain the above copyright 8 # notice, this list of conditions and the following disclaimer. 9 # 10 # 2. Redistributions in binary form must reproduce the above copyright 11 # notice, this list of conditions and the following disclaimer in 12 # the documentation and/or other materials provided with the 13 # distribution. 14 # 15 # 3. Neither the name of Ian Bicking nor the names of its contributors may 16 # be used to endorse or promote products derived from this software 17 # without specific prior written permission. 18 # 19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 23 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 """The ``lxml.html`` tool set for HTML handling. 
32 """ 33 34 import sys 35 import re 36 try: 37 from urlparse import urljoin 38 except ImportError: 39 # Python 3 40 from urllib.parse import urljoin 41 import copy 42 from lxml import etree 43 from lxml.html import defs 44 from lxml.html._setmixin import SetMixin 45 try: 46 from collections import MutableMapping as DictMixin 47 except ImportError: 48 # Python < 2.6 49 from UserDict import DictMixin 50 try: 51 set 52 except NameError: 53 # Python 2.3 54 from sets import Set as set 55 try: 56 bytes 57 except NameError: 58 # Python < 2.6 59 bytes = str 60 try: 61 unicode 62 except NameError: 63 # Python 3 64 unicode = str 65 try: 66 basestring 67 except NameError: 68 # Python 3 69 basestring = (str, bytes) 70

def __fix_docstring(s):
    """
    Strip the string-literal prefix from doctest output lines in ``s``.

    On Python 3 a leading ``u'`` at the start of a line (after optional
    whitespace) becomes ``'``; on Python 2 the same is done for ``b'``.
    Empty or ``None`` input is returned unchanged.
    """
    if not s:
        return s
    if sys.version_info[0] >= 3:
        prefix = 'u'
    else:
        prefix = 'b'
    pattern = re.compile(r"^(\s*)%s'" % prefix, re.M)
    return pattern.sub(r"\1'", s)

# Public names exported by ``from lxml.html import *``.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions; each one matches both plain HTML tags
# and their XHTML-namespaced equivalents (prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Matches CSS ``url(...)``; group 1 is the argument, still including any
# surrounding single or double quotes (see _unquote_match).  The
# parentheses must be escaped so they match literally.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
# Matches ``@import "..."``; group 1 is the quoted target.
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Splits the space-separated URL list of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')

def _unquote_match(s, pos):
    """
    Strip one pair of matching quotes from ``s``.

    Returns ``(text, position)``.  If ``s`` starts and ends with the same
    quote character (``"`` or ``'``) the quotes are removed and ``pos`` is
    advanced by one to point past the opening quote; otherwise both
    values are returned unchanged.
    """
    first, last = s[:1], s[-1:]
    if (first == '"' and last == '"') or (first == "'" and last == "'"):
        return s[1:-1], pos+1
    return s, pos

110

def _transform_result(typ, result):
    """
    Convert ``result`` back into the type the caller originally passed in.

    For a bytes input type the result is serialised as UTF-8 bytes, for a
    unicode input type as a unicode string; for any other type ``result``
    is returned as is.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    if issubclass(typ, unicode):
        return tostring(result, encoding=unicode)
    return result

120

def _nons(tag):
    """
    Return ``tag`` without the XHTML namespace.

    String tags of the form ``{<XHTML namespace>}name`` are reduced to
    ``name``.  Anything else, including non-string tags, is returned
    unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag

126

class HtmlMixin(object):
    """
    Mix-in with the HTML-specific helpers shared by the lxml.html node
    classes: base URL, forms, labels, tree surgery and link handling.
    """

    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        tree = self.getroottree()
        return tree.docinfo.URL
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.
        """
        matches = self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})
        return matches[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.
        """
        matches = self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})
        return matches[0]
    head = property(head, doc=head.__doc__)

    def _label__get(self):
        """
        Get or set any <label> element associated with this element.
        """
        el_id = self.get('id')
        if not el_id:
            return None
        labels = _label_xpath(self, id=el_id)
        if labels:
            return labels[0]
        return None

    def _label__set(self, label):
        el_id = self.get('id')
        if not el_id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', el_id)

    def _label__del(self):
        current = self.label
        if current is not None:
            del current.attrib['for']

    label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        tail = self.tail
        if tail:
            previous = self.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + tail
            else:
                parent.text = (parent.text or '') + tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding=unicode))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        text, tail = self.text, self.tail
        if text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is not None:
                previous.tail = (previous.tail or '') + text
            else:
                parent.text = (parent.text or '') + text
        if tail:
            if len(self):
                # The tail follows our last child once the children are
                # spliced into the parent.
                last = self[-1]
                last.tail = (last.tail or '') + tail
            elif previous is not None:
                previous.tail = (previous.tail or '') + tail
            else:
                parent.text = (parent.text or '') + tail
        index = parent.index(self)
        # Replace this element by its children, in place.
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
        """
        wanted = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == wanted]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        Raises TypeError if no ``base_url`` is given and the document
        has none either.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()
        def link_repl(href):
            return urljoin(base_url, href)
        # Any <base href> has either been resolved just above or must be
        # ignored, so rewrite_links() must not resolve it (again) itself.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
        # Every <base> tag is dropped; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter():
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in 'classid', 'data':
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            else:
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            if tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            if tag == 'style' and el.text:
                urls = [
                    _unquote_match(match.group(1), match.start(1))
                    for match in _css_url_re.finditer(el.text)
                    ] + [
                    (match.group(1), match.start(1))
                    for match in _css_import_re.finditer(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    urls = [ (start, url) for (url, start) in urls ]
                    urls.sort()
                    # reverse the list to report correct positions despite
                    # modifications
                    urls.reverse()
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_css_url_re.finditer(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()
        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue
            if attrib is None:
                # The link lives in the element text (e.g. a <style> body).
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.attrib[attrib]
                if not pos and len(cur) == len(link):
                    # Most common case
                    el.attrib[attrib] = new_link
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                    el.attrib[attrib] = new
458 459

class _MethodFunc(object):
    """
    Wraps an element method so it can be used as a plain function.

    The function accepts either an element or an HTML string.  It
    returns whatever the method returns; if the method works in-place
    (and so returns None), the modified document is returned instead,
    converted back to the input type.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # Expose the wrapped method's documentation on the function object.
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # An explicit ``copy`` keyword overrides the default of this wrapper.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        method = getattr(doc, self.name)
        result = method(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # The method worked in-place: hand back what we got in.
        return _transform_result(result_type, doc)

# Function forms of the HtmlMixin methods.  The ones that modify the
# document work on a copy by default (copy=True); the read-only ones
# operate on the element that is passed in.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that carries the HtmlMixin helpers."""

504

class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that carries the HtmlMixin helpers."""

507

class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction node class that carries the HtmlMixin helpers."""

510

class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that carries the HtmlMixin helpers."""

513 514

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as a sequence of
    ``(tag name, mixin class)`` pairs.
    """
    # Tag name -> Element class registry used when no ``classes`` are given.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Accept a mapping, as documented, as well as the sequence of
            # pairs that the loop below unpacks.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # Mix into every class registered so far.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # The mixins come first so that they take precedence over
                # the element class.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of ``node_type``, or None to
        fall back to the normal lookup.  Element classes are looked up
        by lower-cased tag name and default to HtmlElement.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

################################################################################
# parsing
################################################################################

# Match functions that tell whether a string starts (after optional
# whitespace, case-insensitively) with ``<html`` or ``<!doctype``, i.e.
# looks like a complete document.  One variant per string type.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match

def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` as a complete document and return its root element.

    Uses the module's ``html_parser`` unless another ``parser`` is given;
    extra keyword arguments are passed on to ``etree.fromstring()``.
    Raises ``etree.ParserError`` if parsing yields no document.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")

573

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    Raises ``etree.ParserError`` if there is leading text while
    ``no_leading_text`` is true.
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # %-formatting of bytes is not available on early Python 3
            # versions, so build the wrapper by concatenation.
            html = ('<html><body>'.encode('ascii') + html +
                    '</body></html>'.encode('ascii'))
    else:
        if not _looks_like_full_html_unicode(html):
            html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    if no_leading_text and body.text and body.text.strip():
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    if body.text and body.text.strip():
        elements.append(body.text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements

610

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when it can go into a new parent.
    leading_text_ok = bool(create_parent)

    parts = fragments_fromstring(
        html, parser=parser, no_leading_text=not leading_text_ok,
        base_url=base_url, **kw)

    if create_parent:
        if not isinstance(create_parent, basestring):
            create_parent = 'div'
        wrapper = Element(create_parent)
        if parts:
            if isinstance(parts[0], basestring):
                wrapper.text = parts[0]
                del parts[0]
            wrapper.extend(parts)
        return wrapper

    if not parts:
        raise etree.ParserError('No elements found')
    if len(parts) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in parts]))
    single = parts[0]
    if single.tail and single.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % single.tail)
    single.tail = None
    return single

656

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # Not obviously a full document: see what structure the parser built.
    body = None
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        # Somehow there can be multiple bodies, which is bad, but just
        # smash them into the first one.
        for extra_body in bodies[1:]:
            if extra_body.text:
                if len(body):
                    body[-1].tail = (body[-1].tail or '') + extra_body.text
                else:
                    body.text = (body.text or '') + extra_body.text
            body.extend(extra_body)
            # We'll ignore tail
            # I guess we are ignoring attributes too
            extra_body.drop_tree()

    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for extra_head in heads[1:]:
            head.extend(extra_head)
            # We don't care about text or tail in a head
            extra_head.drop_tree()
        return doc

    if body is None:
        return doc

    has_leading_text = body.text and body.text.strip()
    if len(body) == 1 and not has_leading_text:
        if not (body[-1].tail and body[-1].tail.strip()):
            # The body has just one element, so it was probably a single
            # element passed in
            return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

723

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)

736

def _contains_block_level_tag(el):
    """
    Tell whether ``el`` or any element below it has a tag listed in
    ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    for node in el.iter():
        if _nons(node.tag) in defs.block_tags:
            return True
    return False

744

def _element_name(el):
    """
    Return a short name for ``el`` for use in error messages:
    ``'comment'`` for comments, ``'string'`` for text items, and the
    tag name (without XHTML namespace) for elements.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

752 753 ################################################################################ 754 # form handling 755 ################################################################################ 756

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.
        """
        return FieldsDict(self.inputs)

    def _fields__set(self, value):
        # Names that are not mentioned in ``value`` are reset to None below.
        leftover = self.fields.keys()
        for key, field_value in value.items():
            if key in leftover:
                leftover.remove(key)
            self.fields[key] = field_value
        for key in leftover:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            self.fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """
        Return a label for this form for use in reprs: its name, else
        ``#`` plus its id, else its index among the forms in the body.
        """
        if self.get('name'):
            return self.get('name')
        if self.get('id'):
            return '#' + self.get('id')
        all_forms = list(self.body.iter('form'))
        if not all_forms:
            all_forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(all_forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.
        """
        pairs = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                pairs.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    # One entry per selected option.
                    for v in value:
                        pairs.append((name, v))
                elif value is not None:
                    pairs.append((name, el.value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset'):
                continue
            value = el.value
            if value is not None:
                pairs.append((name, el.value))
        return pairs

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.
        """
        base_url = self.base_url
        action = self.get('action')
        if not base_url or action is None:
            return action
        # Resolve the action against the document's base URL.
        return urljoin(base_url, action)

    def _action__set(self, value):
        self.set('action', value)

    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']

    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns a capitalized
        string, and defaults to ``'GET'``
        """
        method = self.get('method', 'GET')
        return method.upper()

    def _method__set(self, value):
        self.set('method', value.upper())

    method = property(_method__get, _method__set, doc=_method__get.__doc__)

# Register FormElement as the default class for <form> tags.
HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the values of the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is one of 'GET' or 'POST', the URL is the target URL as a
    string, and the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    # A form without an action is submitted to the document's own URL.
    target = form.action
    if not target:
        target = form.base_url
    return open_http(form.method, target, values)

900

def open_http_urllib(method, url, values):
    """
    Default ``open_http`` implementation for ``submit_form()``, based on
    urllib.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as
    a query string; for any other method they are sent as the request
    body.  Returns the file-like object from ``urlopen()``.

    Raises ValueError if ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        # urlencode() returns text on Python 3, but urlopen() requires
        # the request body as bytes there.
        if not isinstance(data, bytes):
            data = data.encode('ASCII')
    return urlopen(url, data)

920

class FieldsDict(DictMixin):
    """
    Dictionary-like view on the values of a form's fields.

    Items are looked up in the wrapped ``inputs`` accessor; reading or
    assigning an item reads or assigns the ``value`` of that input.
    Items cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        names = self.inputs.keys()
        return iter(names)

    def __len__(self):
        return len(self.inputs)

    def __repr__(self):
        cls_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (cls_name, form_name)

945

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    ``len()`` counts the same elements that iteration yields.
    """

    # Both expressions test local-name() so that they also match
    # namespaced (XHTML) elements.
    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        """
        Return the input called ``name``: a RadioGroup or CheckboxGroup
        if several radio/checkbox inputs share the name, otherwise the
        first matching element.  Raises KeyError if there is none.
        """
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        input_type = results[0].get('type')
        if input_type == 'radio' and len(results) > 1:
            group = RadioGroup(results)
            group.name = name
            return group
        elif input_type == 'checkbox' and len(results) > 1:
            group = CheckboxGroup(results)
            group.name = name
            return group
        else:
            # I don't like throwing away elements like this
            return results[0]

    def __contains__(self, name):
        results = self._name_xpath(self.form, name=name)
        return bool(results)

    def keys(self):
        """
        Return a list of the distinct names of the inputs; unnamed
        inputs are left out.
        """
        names = set()
        for el in self:
            names.add(el.name)
        if None in names:
            names.remove(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        return iter(self._all_xpath(self.form))

    def __len__(self):
        # Needed by FieldsDict.__len__, which calls len() on this object.
        return len(self._all_xpath(self.form))

1011

class InputMixin(object):

    """
    Mix-in shared by all input elements (input, select, and textarea).

    Provides the ``name`` property and a common ``repr()``.
    """

    def _name__get(self):
        """
        Get/set the name of the element (its ``name`` attribute).
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        attributes = self.attrib
        # deleting an absent name is a silent no-op
        if 'name' in attributes:
            del attributes['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # not every class using this mix-in has a ``type`` attribute
        type_suffix = ''
        input_type = getattr(self, 'type', None)
        if input_type:
            type_suffix = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_suffix)

1039

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and
    the content can be read and replaced through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # elements in the XHTML namespace have their children
        # serialised as XML, everything else as HTML
        serialisation_method = 'html'
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        content = self.text or ''
        for child in self:
            # it's rare that we actually get here, so let's not use ''.join()
            content += etree.tostring(
                child, method=serialisation_method, encoding=unicode)
        return content

    def _value__set(self, value):
        # drop any child elements, then store the new plain text
        del self[:]
        self.text = value

    def _value__del(self):
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is
    a set-like object.  In either case ``.value_options`` gives the
    possible values.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is None:
                continue
            # the first selected option wins; an option without a
            # ``value`` attribute is represented by its text
            selected = option.get('value')
            if selected is None:
                selected = option.text or ''
            if selected:
                selected = selected.strip()
            return selected
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        checked_option = None
        if value is not None:
            value = value.strip()
            # find the option to select before changing anything
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = option.text or ''
                if candidate:
                    candidate = candidate.strip()
                if candidate == value:
                    checked_option = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            attributes = option.attrib
            if 'selected' in attributes:
                del attributes['selected']
        if checked_option is not None:
            checked_option.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        possible = []
        for option in _options_xpath(self):
            candidate = option.get('value')
            if candidate is None:
                candidate = option.text or ''
            if candidate:
                candidate = candidate.strip()
            possible.append(candidate)
        return possible

    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        attributes = self.attrib
        if 'multiple' in attributes:
            del attributes['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)

HtmlElementClassLookup._default_element_classes['select'] = SelectElement

1168 -class MultipleSelectOptions(SetMixin):

1169 """ 1170 Represents all the selected options in a ``<select multiple>`` element. 1171 1172 You can add to this set-like option to select an option, or remove 1173 to unselect the option. 1174 """ 1175

1176 - def __init__(self, select):

1177 self.select = select

1178

1179 - def options(self):

1180 """ 1181 Iterator of all the ``<option>`` elements. 1182 """ 1183 return iter(_options_xpath(self.select))

1184 options = property(options) 1185

1186 - def __iter__(self):

1187 for option in self.options: 1188 if 'selected' in option.attrib: 1189 opt_value = option.get('value') 1190 if opt_value is None: 1191 opt_value = option.text or '' 1192 if opt_value: 1193 opt_value = opt_value.strip() 1194 yield opt_value

1195

1196 - def add(self, item):

1197 for option in self.options: 1198 opt_value = option.get('value') 1199 if opt_value is None: 1200 opt_value = option.text or '' 1201 if opt_value: 1202 opt_value = opt_value.strip() 1203 if opt_value == item: 1204 option.set('selected', '') 1205 break 1206 else: 1207 raise ValueError( 1208 "There is no option with the value %r" % item)

1209

1210 - def remove(self, item):

1211 for option in self.options: 1212 opt_value = option.get('value') 1213 if opt_value is None: 1214 opt_value = option.text or '' 1215 if opt_value: 1216 opt_value = opt_value.strip() 1217 if opt_value == item: 1218 if 'selected' in option.attrib: 1219 del option.attrib['selected'] 1220 else: 1221 raise ValueError( 1222 "The option %r is not currently selected" % item) 1223 break 1224 else: 1225 raise ValueError( 1226 "There is not option with the value %r" % item)

1227

1228 - def __repr__(self):

1229 return '<%s {%s} for select name=%r>' % ( 1230 self.__class__.__name__, 1231 ', '.join([repr(v) for v in self]), 1232 self.select.name)

1233

class RadioGroup(list):
    """
    A list of several ``<input type=radio>`` elements that share the
    same name.

    Besides the normal list behaviour, the ``.value`` property
    checks/unchecks the inputs and ``.value_options`` lists the
    possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' not in radio.attrib:
                continue
            return radio.get('value')
        return None

    def _value__set(self, value):
        target = None
        if value is not None:
            # locate the radio to check first, so that an unknown value
            # is reported before anything gets unchecked
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            else:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            attributes = radio.attrib
            if 'checked' in attributes:
                del attributes['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        # deleting the value unchecks every radio of the group
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        possible = []
        for radio in self:
            possible.append(radio.get('value'))
        return possible

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)

1285

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.

        Assigning an iterable of values checks exactly those
        checkboxes; assigning anything that is not iterable, or a
        plain string, raises ValueError and leaves the group untouched.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before touching any checkbox, so that a rejected
        # assignment does not wipe the current state.  Strings are
        # rejected explicitly: on Python 3 they do have ``__iter__``
        # and would otherwise be applied one character at a time.
        if not hasattr(value, '__iter__') or isinstance(value, basestring):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def _value__del(self):
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1324

1325 -class CheckboxValues(SetMixin):

1326 1327 """ 1328 Represents the values of the checked checkboxes in a group of 1329 checkboxes with the same name. 1330 """ 1331

1332 - def __init__(self, group):

1333 self.group = group

1334

1335 - def __iter__(self):

1336 return iter([ 1337 el.get('value') 1338 for el in self.group 1339 if 'checked' in el.attrib])

1340

1341 - def add(self, value):

1342 for el in self.group: 1343 if el.get('value') == value: 1344 el.set('checked', '') 1345 break 1346 else: 1347 raise KeyError("No checkbox with value %r" % value)

1348

1349 - def remove(self, value):

1350 for el in self.group: 1351 if el.get('value') == value: 1352 if 'checked' in el.attrib: 1353 del el.attrib['checked'] 1354 else: 1355 raise KeyError( 1356 "The checkbox with value %r was already unchecked" % value) 1357 break 1358 else: 1359 raise KeyError( 1360 "No checkbox with value %r" % value)

1361

1362 - def __repr__(self):

1363 return '<%s {%s} for checkboxes name=%r>' % ( 1364 self.__class__.__name__, 1365 ', '.join([repr(v) for v in self]), 1366 self.group.name)

1367

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The type is available as ``.type`` (lower-cased, defaulting to
    ``'text'``), and the value can be read and changed through
    ``.value``.

    Checkboxes and radios have ``input.checkable == True`` (it is
    false for all other types) and a boolean attribute ``.checked``.
    """

    ## FIXME: I'm a little uncomfortable with the use of .checked

    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            # a false value only unchecks, the value attribute stays
            self.checked = False
            return
        self.checked = True
        # a true non-string value (e.g. True) checks the input without
        # replacing its value attribute
        if isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
            return
        attributes = self.attrib
        if 'value' in attributes:
            del attributes['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)

HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.

        Setting requires the other element to have an ``id`` attribute
        (TypeError otherwise); deleting removes the ``for`` attribute
        of this label.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        try:
            return self.body.get_element_by_id(target_id)
        except KeyError:
            # NOTE(review): get_element_by_id() is expected to raise
            # KeyError for an unknown id; honour the documented
            # "return None" contract in that case.
            return None

    def _for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link to the other element lives in the ``for`` attribute;
        # the label's own ``id`` must not be touched.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)

HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    ``html`` may be an element or an object with a ``getroot()``
    method.  The tree is modified in place; tags that already carry a
    namespace are left alone.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # already an element
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in html.iter():
        tag = node.tag
        # skip nodes whose tag is not a string
        if not isinstance(tag, basestring):
            continue
        if tag[0] == '{':
            continue
        node.tag = xhtml_prefix + tag

1503

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    ``xhtml`` may be an element or an object with a ``getroot()``
    method.  The tree is modified in place; only elements in the XHTML
    namespace are visited.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # already an element
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    strip_count = len(xhtml_prefix)
    for node in xhtml.iter(xhtml_prefix + "*"):
        qualified_tag = node.tag
        node.tag = qualified_tag[strip_count:]

# Substitution helpers that match a ``<meta http-equiv="Content-Type" ...>``
# tag in serialised output; ``tostring()`` uses them to remove that tag.
# One helper works on text strings, the other on the ASCII-encoded
# byte form of the same pattern.
#
# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    # the meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it
    if include_meta_content_type or method != 'html':
        return serialised
    # pick the substitution helper matching the serialised string type
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)

tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``doc`` may be an element or an ElementTree.  The file is written
    with ``encoding`` if given, otherwise with the encoding recorded in
    the document, otherwise as UTF-8.  The ``file://`` URL of the
    temporary file is printed before the browser is started.
    """
    import os
    import webbrowser
    import tempfile
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    handle, fn = tempfile.mkstemp(suffix='.html')
    f = os.fdopen(handle, 'wb')
    try:
        doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
    finally:
        # we leak the file itself here, but we should at least close it
        f.close()
    url_path = fn.replace(os.path.sep, '/')
    if not url_path.startswith('/'):
        # e.g. a Windows drive path ("C:/..."): without the extra slash
        # the drive letter would end up in the host part of the URL
        url_path = '/' + url_path
    url = 'file://' + url_path
    print(url)
    webbrowser.open(url)

1620 1621 ################################################################################ 1622 # configure Element class lookup 1623 ################################################################################ 1624

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        # make the parser build the lxml.html element classes
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1632

class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.

    All keyword arguments are passed on to ``etree.XMLParser``.
    """

    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        # make the parser build the lxml.html element classes
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1653

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.

    All arguments are handed to ``html_parser.makeelement()``, whose
    result is returned.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instances, created once with default options.
# ``html_parser`` is the parser that ``Element()`` uses to create new
# elements.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Source Code for Package lxml.html