lxml.html

1 # Copyright (c) 2004 Ian Bicking. All rights reserved. 2 # 3 # Redistribution and use in source and binary forms, with or without 4 # modification, are permitted provided that the following conditions are 5 # met: 6 # 7 # 1. Redistributions of source code must retain the above copyright 8 # notice, this list of conditions and the following disclaimer. 9 # 10 # 2. Redistributions in binary form must reproduce the above copyright 11 # notice, this list of conditions and the following disclaimer in 12 # the documentation and/or other materials provided with the 13 # distribution. 14 # 15 # 3. Neither the name of Ian Bicking nor the names of its contributors may 16 # be used to endorse or promote products derived from this software 17 # without specific prior written permission. 18 # 19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 23 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 """The ``lxml.html`` tool set for HTML handling. 
32 """ 33 34 import threading 35 import re 36 try: 37 from urlparse import urljoin 38 except ImportError: 39 # Python 3 40 from urllib.parse import urljoin 41 import copy 42 from lxml import etree 43 from lxml.html import defs 44 from lxml.html._setmixin import SetMixin 45 try: 46 from collections import MutableMapping as DictMixin 47 except ImportError: 48 # Python < 2.6 49 from UserDict import DictMixin 50 try: 51 set 52 except NameError: 53 # Python 2.3 54 from sets import Set as set 55 try: 56 bytes 57 except NameError: 58 # Python < 2.6 59 bytes = str 60 try: 61 unicode 62 except NameError: 63 # Python 3 64 unicode = str 65 try: 66 basestring 67 except NameError: 68 # Python 3 69 basestring = (str, bytes) 70

def __fix_docstring(s):
    """Drop the string-literal prefix at the start of docstring lines.

    On Python 3 a ``u'`` that opens a line (after optional whitespace) is
    rewritten to ``'``; on Python 2 the same is done for ``b'``.  A falsy
    argument (``None`` or an empty string) is returned unchanged.
    """
    if not s:
        return s
    import sys
    if sys.version_info[0] < 3:
        prefix = "b"
    else:
        prefix = "u"
    pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
    return pattern.sub(r"\1'", s)

# Public names of this module (each name is listed once).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions; each matches both the plain tag and the
# same tag in the XHTML namespace (prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Matches CSS ``url(...)``; group 1 is the (possibly quoted) content between
# the literal parentheses, which therefore have to be escaped.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Splits the space-separated ``archive`` attribute of <object> into entries.
_archive_re = re.compile(r'[^ ]+')

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s`` and adjust ``pos``.

    When ``s`` starts and ends with the same quote character (``"`` or
    ``'``), return ``(s without its first and last character, pos + 1)``;
    otherwise return ``(s, pos)`` unchanged.
    """
    for quote in ('"', "'"):
        if s[:1] == quote and s[-1:] == quote:
            return s[1:-1], pos + 1
    return s, pos

110

def _transform_result(typ, result):
    """Convert the result back into the input type.

    ``typ`` is the type of the original input.  For a ``bytes`` subclass the
    result is serialized with ``tostring(..., encoding='utf-8')``, for a
    ``unicode`` subclass with ``tostring(..., encoding=unicode)``; for any
    other type ``result`` is returned as is.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    if issubclass(typ, unicode):
        return tostring(result, encoding=unicode)
    return result

120

def _nons(tag):
    """Return ``tag`` without an XHTML namespace prefix.

    A string of the form ``{<XHTML namespace>}name`` yields ``name``; any
    other string, and any non-string tag, is returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag

126

127 -class HtmlMixin(object):

128

129 - def base_url(self):

130 """ 131 Returns the base URL, given when the page was parsed. 132 133 Use with ``urlparse.urljoin(el.base_url, href)`` to get 134 absolute URLs. 135 """ 136 return self.getroottree().docinfo.URL

137 base_url = property(base_url, doc=base_url.__doc__) 138

139 - def forms(self):

140 """ 141 Return a list of all the forms 142 """ 143 return _forms_xpath(self)

144 forms = property(forms, doc=forms.__doc__) 145

146 - def body(self):

147 """ 148 Return the <body> element. Can be called from a child element 149 to get the document's head. 150 """ 151 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]

152 body = property(body, doc=body.__doc__) 153

154 - def head(self):

155 """ 156 Returns the <head> element. Can be called from a child 157 element to get the document's head. 158 """ 159 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]

160 head = property(head, doc=head.__doc__) 161

162 - def _label__get(self):

163 """ 164 Get or set any <label> element associated with this element. 165 """ 166 id = self.get('id') 167 if not id: 168 return None 169 result = _label_xpath(self, id=id) 170 if not result: 171 return None 172 else: 173 return result[0]

174 - def _label__set(self, label):

175 id = self.get('id') 176 if not id: 177 raise TypeError( 178 "You cannot set a label for an element (%r) that has no id" 179 % self) 180 if _nons(label.tag) != 'label': 181 raise TypeError( 182 "You can only assign label to a label element (not %r)" 183 % label) 184 label.set('for', id)

185 - def _label__del(self):

186 label = self.label 187 if label is not None: 188 del label.attrib['for']

189 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__) 190

191 - def drop_tree(self):

192 """ 193 Removes this element from the tree, including its children and 194 text. The tail text is joined to the previous element or 195 parent. 196 """ 197 parent = self.getparent() 198 assert parent is not None 199 if self.tail: 200 previous = self.getprevious() 201 if previous is None: 202 parent.text = (parent.text or '') + self.tail 203 else: 204 previous.tail = (previous.tail or '') + self.tail 205 parent.remove(self)

206

207 - def drop_tag(self):

208 """ 209 Remove the tag, but not its children or text. The children and text 210 are merged into the parent. 211 212 Example:: 213 214 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 215 >>> h.find('.//b').drop_tag() 216 >>> print(tostring(h, encoding=unicode)) 217 <div>Hello World!</div> 218 """ 219 parent = self.getparent() 220 assert parent is not None 221 previous = self.getprevious() 222 if self.text and isinstance(self.tag, basestring): 223 # not a Comment, etc. 224 if previous is None: 225 parent.text = (parent.text or '') + self.text 226 else: 227 previous.tail = (previous.tail or '') + self.text 228 if self.tail: 229 if len(self): 230 last = self[-1] 231 last.tail = (last.tail or '') + self.tail 232 elif previous is None: 233 parent.text = (parent.text or '') + self.tail 234 else: 235 previous.tail = (previous.tail or '') + self.tail 236 index = parent.index(self) 237 parent[index:index+1] = self[:]

238

239 - def find_rel_links(self, rel):

240 """ 241 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements. 242 """ 243 rel = rel.lower() 244 return [el for el in _rel_links_xpath(self) 245 if el.get('rel').lower() == rel]

246

247 - def find_class(self, class_name):

248 """ 249 Find any elements with the given class name. 250 """ 251 return _class_xpath(self, class_name=class_name)

252

253 - def get_element_by_id(self, id, *default):

254 """ 255 Get the first element in a document with the given id. If none is 256 found, return the default argument if provided or raise KeyError 257 otherwise. 258 259 Note that there can be more than one element with the same id, 260 and this isn't uncommon in HTML documents found in the wild. 261 Browsers return only the first match, and this function does 262 the same. 263 """ 264 try: 265 # FIXME: should this check for multiple matches? 266 # browsers just return the first one 267 return _id_xpath(self, id=id)[0] 268 except IndexError: 269 if default: 270 return default[0] 271 else: 272 raise KeyError(id)

273

274 - def text_content(self):

275 """ 276 Return the text content of the tag (and the text in any children). 277 """ 278 return _collect_string_content(self)

279

280 - def cssselect(self, expr, translator='html'):

281 """ 282 Run the CSS expression on this element and its children, 283 returning a list of the results. 284 285 Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) 286 -- note that pre-compiling the expression can provide a substantial 287 speedup. 288 """ 289 # Do the import here to make the dependency optional. 290 from lxml.cssselect import CSSSelector 291 return CSSSelector(expr, translator=translator)(self)

292 293 ######################################## 294 ## Link functions 295 ######################################## 296

297 - def make_links_absolute(self, base_url=None, resolve_base_href=True):

298 """ 299 Make all links in the document absolute, given the 300 ``base_url`` for the document (the full URL where the document 301 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document. 302 303 If ``resolve_base_href`` is true, then any ``<base href>`` 304 tags in the document are used *and* removed from the document. 305 If it is false then any such tag is ignored. 306 """ 307 if base_url is None: 308 base_url = self.base_url 309 if base_url is None: 310 raise TypeError( 311 "No base_url given, and the document has no base_url") 312 if resolve_base_href: 313 self.resolve_base_href() 314 def link_repl(href): 315 return urljoin(base_url, href)

316 self.rewrite_links(link_repl)

317

318 - def resolve_base_href(self):

319 """ 320 Find any ``<base href>`` tag in the document, and apply its 321 values to all links found in the document. Also remove the 322 tag once it has been applied. 323 """ 324 base_href = None 325 basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE}) 326 for b in basetags: 327 base_href = b.get('href') 328 b.drop_tree() 329 if not base_href: 330 return 331 self.make_links_absolute(base_href, resolve_base_href=False)

332

333 - def iterlinks(self):

334 """ 335 Yield (element, attribute, link, pos), where attribute may be None 336 (indicating the link is in the text). ``pos`` is the position 337 where the link occurs; often 0, but sometimes something else in 338 the case of links in stylesheets or style tags. 339 340 Note: <base href> is *not* taken into account in any way. The 341 link you get is exactly the link in the document. 342 343 Note: multiple links inside of a single text string or 344 attribute value are returned in reversed order. This makes it 345 possible to replace or delete them from the text string value 346 based on their reported text positions. Otherwise, a 347 modification at one text position can change the positions of 348 links reported later on. 349 """ 350 link_attrs = defs.link_attrs 351 for el in self.iter(): 352 attribs = el.attrib 353 tag = _nons(el.tag) 354 if tag != 'object': 355 for attrib in link_attrs: 356 if attrib in attribs: 357 yield (el, attrib, attribs[attrib], 0) 358 elif tag == 'object': 359 codebase = None 360 ## <object> tags have attributes that are relative to 361 ## codebase 362 if 'codebase' in attribs: 363 codebase = el.get('codebase') 364 yield (el, 'codebase', codebase, 0) 365 for attrib in 'classid', 'data': 366 if attrib in attribs: 367 value = el.get(attrib) 368 if codebase is not None: 369 value = urljoin(codebase, value) 370 yield (el, attrib, value, 0) 371 if 'archive' in attribs: 372 for match in _archive_re.finditer(el.get('archive')): 373 value = match.group(0) 374 if codebase is not None: 375 value = urljoin(codebase, value) 376 yield (el, 'archive', value, match.start()) 377 if tag == 'param': 378 valuetype = el.get('valuetype') or '' 379 if valuetype.lower() == 'ref': 380 ## FIXME: while it's fine we *find* this link, 381 ## according to the spec we aren't supposed to 382 ## actually change the value, including resolving 383 ## it. 
It can also still be a link, even if it 384 ## doesn't have a valuetype="ref" (which seems to be the norm) 385 ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype 386 yield (el, 'value', el.get('value'), 0) 387 if tag == 'style' and el.text: 388 urls = [ 389 _unquote_match(match.group(1), match.start(1)) 390 for match in _css_url_re.finditer(el.text) 391 ] + [ 392 (match.group(1), match.start(1)) 393 for match in _css_import_re.finditer(el.text) 394 ] 395 if urls: 396 # sort by start pos to bring both match sets back into order 397 urls = [ (start, url) for (url, start) in urls ] 398 urls.sort() 399 # reverse the list to report correct positions despite 400 # modifications 401 urls.reverse() 402 for start, url in urls: 403 yield (el, None, url, start) 404 if 'style' in attribs: 405 urls = list(_css_url_re.finditer(attribs['style'])) 406 if urls: 407 # return in reversed order to simplify in-place modifications 408 for match in urls[::-1]: 409 url, start = _unquote_match(match.group(1), match.start(1)) 410 yield (el, 'style', url, start)

411

412 - def rewrite_links(self, link_repl_func, resolve_base_href=True, 413 base_href=None):

414 """ 415 Rewrite all the links in the document. For each link 416 ``link_repl_func(link)`` will be called, and the return value 417 will replace the old link. 418 419 Note that links may not be absolute (unless you first called 420 ``make_links_absolute()``), and may be internal (e.g., 421 ``'#anchor'``). They can also be values like 422 ``'mailto:email'`` or ``'javascript:expr'``. 423 424 If you give ``base_href`` then all links passed to 425 ``link_repl_func()`` will take that into account. 426 427 If the ``link_repl_func`` returns None, the attribute or 428 tag text will be removed completely. 429 """ 430 if base_href is not None: 431 # FIXME: this can be done in one pass with a wrapper 432 # around link_repl_func 433 self.make_links_absolute(base_href, resolve_base_href=resolve_base_href) 434 elif resolve_base_href: 435 self.resolve_base_href() 436 for el, attrib, link, pos in self.iterlinks(): 437 new_link = link_repl_func(link.strip()) 438 if new_link == link: 439 continue 440 if new_link is None: 441 # Remove the attribute or element content 442 if attrib is None: 443 el.text = '' 444 else: 445 del el.attrib[attrib] 446 continue 447 if attrib is None: 448 new = el.text[:pos] + new_link + el.text[pos+len(link):] 449 el.text = new 450 else: 451 cur = el.attrib[attrib] 452 if not pos and len(cur) == len(link): 453 # Most common case 454 el.attrib[attrib] = new_link 455 else: 456 new = cur[:pos] + new_link + cur[pos+len(link):] 457 el.attrib[attrib] = new

458 459

class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # ``copy`` is the default for whether an element argument is
        # deep-copied before the method is applied to it.
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            # A string is always parsed into a fresh tree, so ``copy`` is
            # meaningless for it.
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # In-place method: return the document in the form we got it in
        return _transform_result(result_type, doc)

# Function forms of the HtmlMixin methods of the same name.  Each accepts an
# element or an HTML string as its first argument; ``copy`` sets whether an
# element argument is deep-copied by default before the method runs.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class: ``etree.CommentBase`` combined with ``HtmlMixin``."""
    pass

504

class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class: ``etree.ElementBase`` combined with ``HtmlMixin``."""
    pass

507

class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class: ``etree.PIBase`` combined with ``HtmlMixin``."""
    pass

510

class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity class: ``etree.EntityBase`` combined with ``HtmlMixin``."""
    pass

513 514

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    # Default tag name -> Element class mapping; filled in at module level.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would yield only its keys, so
            # take the (name, class) pairs explicitly.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first in the bases, ahead of the current class.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup.

        Elements are resolved by lower-cased tag name, falling back to
        ``HtmlElement``; comments, processing instructions and entities map
        to their Html* classes.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

555 556 ################################################################################ 557 # parsing 558 ################################################################################ 559

def document_fromstring(html, parser=None, **kw):
    """Parse ``html`` with ``etree.fromstring`` and return the result.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring``.  Raises
    ``etree.ParserError`` when parsing yields ``None``.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is None:
        raise etree.ParserError(
            "Document is empty")
    return root

568

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    start = html[:20].lstrip().lower()
    looks_like_document = (start.startswith('<html')
                           or start.startswith('<!doctype'))
    if not looks_like_document:
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    if body.text and body.text.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % body.text)
        elements.append(body.text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements

602

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a wrapper element will hold it.
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not bool(create_parent),
        base_url=base_url, **kw)

    if create_parent:
        if isinstance(create_parent, basestring):
            wrapper_tag = create_parent
        else:
            wrapper_tag = 'div'
        new_root = Element(wrapper_tag)
        if elements:
            if isinstance(elements[0], basestring):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = [_element_name(e) for e in elements]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    el.tail = None
    return el

648

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    If the parsed tree has a <head>, or has neither <head> nor <body>, the
    whole document is returned.  A body with a single child and no
    surrounding text yields that child; otherwise the body is returned,
    renamed to ``div`` or ``span``.
    """
    if parser is None:
        parser = html_parser
    start = html[:10].lstrip().lower()
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return doc
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        # Somehow there can be multiple bodies, which is bad, but just
        # smash them into one body
        for other_body in bodies[1:]:
            if other_body.text:
                if len(body):
                    body[-1].tail = (body[-1].tail or '') + other_body.text
                else:
                    body.text = (body.text or '') + other_body.text
            body.extend(other_body)
            # We'll ignore tail
            # I guess we are ignoring attributes too
            other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc
    if body is None:
        # Neither head nor body: there is nothing to unwrap, and calling
        # len() on None below would raise TypeError.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

711

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)

724

def _contains_block_level_tag(el):
    """Return True if ``el`` or any element below it has a tag in ``defs.block_tags``."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    return any(_nons(node.tag) in defs.block_tags for node in el.iter())

732

def _element_name(el):
    """Return a short name for a parsed item, for use in error messages.

    Comments give ``'comment'``, strings give ``'string'``, and anything
    else gives its tag without the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

740 741 ################################################################################ 742 # form handling 743 ################################################################################ 744

745 -class FormElement(HtmlElement):

746 """ 747 Represents a <form> element. 748 """ 749

750 - def inputs(self):

751 """ 752 Returns an accessor for all the input elements in the form. 753 754 See `InputGetter` for more information about the object. 755 """ 756 return InputGetter(self)

757 inputs = property(inputs, doc=inputs.__doc__) 758

759 - def _fields__get(self):

760 """ 761 Dictionary-like object that represents all the fields in this 762 form. You can set values in this dictionary to effect the 763 form. 764 """ 765 return FieldsDict(self.inputs)

766 - def _fields__set(self, value):

767 prev_keys = self.fields.keys() 768 for key, value in value.iteritems(): 769 if key in prev_keys: 770 prev_keys.remove(key) 771 self.fields[key] = value 772 for key in prev_keys: 773 if key is None: 774 # Case of an unnamed input; these aren't really 775 # expressed in form_values() anyway. 776 continue 777 self.fields[key] = None

778 779 fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__) 780

781 - def _name(self):

782 if self.get('name'): 783 return self.get('name') 784 elif self.get('id'): 785 return '#' + self.get('id') 786 forms = list(self.body.iter('form')) 787 if not forms: 788 forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE)) 789 return str(forms.index(self))

790

791 - def form_values(self):

792 """ 793 Return a list of tuples of the field values for the form. 794 This is suitable to be passed to ``urllib.urlencode()``. 795 """ 796 results = [] 797 for el in self.inputs: 798 name = el.name 799 if not name: 800 continue 801 tag = _nons(el.tag) 802 if tag == 'textarea': 803 results.append((name, el.value)) 804 elif tag == 'select': 805 value = el.value 806 if el.multiple: 807 for v in value: 808 results.append((name, v)) 809 elif value is not None: 810 results.append((name, el.value)) 811 else: 812 assert tag == 'input', ( 813 "Unexpected tag: %r" % el) 814 if el.checkable and not el.checked: 815 continue 816 if el.type in ('submit', 'image', 'reset'): 817 continue 818 value = el.value 819 if value is not None: 820 results.append((name, el.value)) 821 return results

822

823 - def _action__get(self):

824 """ 825 Get/set the form's ``action`` attribute. 826 """ 827 base_url = self.base_url 828 action = self.get('action') 829 if base_url and action is not None: 830 return urljoin(base_url, action) 831 else: 832 return action

833 - def _action__set(self, value):

834 self.set('action', value)

835 - def _action__del(self):

836 if 'action' in self.attrib: 837 del self.attrib['action']

838 action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__) 839

840 - def _method__get(self):

841 """ 842 Get/set the form's method. Always returns a capitalized 843 string, and defaults to ``'GET'`` 844 """ 845 return self.get('method', 'GET').upper()

846 - def _method__set(self, value):

847 self.set('method', value.upper())

848 method = property(_method__get, _method__set, doc=_method__get.__doc__)

# Register FormElement as the default Element class for <form> tags.
HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the form's own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (e.g. 'GET' or 'POST'), the URL is the
    target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.  The return value of
    ``open_http`` is returned unchanged.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    # Fall back to the document's own URL when the form has no action.
    if form.action:
        url = form.action
    else:
        url = form.base_url
    return open_http(form.method, url, values)

888

def open_http_urllib(method, url, values):
    """Default HTTP opener for ``submit_form()``, built on urllib.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request body.
    Returns the result of ``urlopen``.  Raises ``ValueError`` when ``url``
    is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        # Python 3's urlopen() needs the request body as bytes; urlencode()
        # output is percent-encoded, so ASCII is sufficient.
        if not isinstance(data, bytes):
            data = data.encode('ASCII')
    return urlopen(url, data)

908

class FieldsDict(DictMixin):
    """Dictionary-style view of a form's field values.

    Wraps an inputs accessor (``inputs``): items are looked up by name
    through it, and reading or writing an item reads or writes the
    ``value`` of what the accessor returns.  Keys cannot be removed.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        return self.inputs[item].value

    def __setitem__(self, item, value):
        self.inputs[item].value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __iter__(self):
        return iter(self.inputs.keys())

    def __len__(self):
        # Count the keys, matching __iter__; the inputs accessor itself is
        # not required to support len().
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())

933

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        form_name = self.form._name()
        return '<%s for form %s>' % (
            self.__class__.__name__,
            form_name)

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        input_type = results[0].get('type')
        if len(results) > 1:
            # Several radio/checkbox inputs sharing a name form a group;
            # the type of the first match decides.
            group = None
            if input_type == 'radio':
                group = RadioGroup(results)
            elif input_type == 'checkbox':
                group = CheckboxGroup(results)
            if group is not None:
                group.name = name
                return group
        # I don't like throwing away elements like this
        return results[0]

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        # Distinct names of all inputs; unnamed inputs are left out.
        names = set([el.name for el in self])
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)

999

class InputMixin(object):

    """
    Shared behaviour for the input-like elements (input, select and
    textarea): a ``name`` property and a common ``repr``.
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        # Deleting an absent name is a no-op rather than an error.
        attributes = self.attrib
        if 'name' not in attributes:
            return
        del attributes['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Not every class using this mix-in has a ``type`` attribute, so
        # look it up defensively and only mention it when it is set.
        type_name = getattr(self, 'type', None)
        if type_name:
            type_suffix = ' type=%r' % type_name
        else:
            type_suffix = ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_suffix)

1027

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    content can be read and replaced through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Elements in the XHTML namespace are serialised as XML,
        # everything else as HTML.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        pieces = [self.text or '']
        # Child elements inside a textarea are rare; when present their
        # serialised markup is appended to the leading text.
        for child in self:
            pieces.append(
                etree.tostring(child, method=method, encoding=unicode))
        return ''.join(pieces)

    def _value__set(self, value):
        # Drop any child elements, then store the new text.
        del self[:]
        self.text = value

    def _value__del(self):
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  The name is available as ``.name``.

    ``.value`` is the value of the selected option, except for a
    multi-select (``<select multiple>``), where it is a set-like
    object.  ``.value_options`` lists the possible values either way.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is None:
                continue
            # An option without a ``value`` attribute falls back to its text.
            selected = option.get('value')
            if selected is None:
                selected = option.text or ''
            if selected:
                selected = selected.strip()
            return selected
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        chosen = None
        if value is not None:
            value = value.strip()
            # Find the option to select before touching any attribute, so
            # an unknown value leaves the current selection alone.
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = option.text or ''
                if candidate:
                    candidate = candidate.strip()
                if candidate == value:
                    chosen = option
                    break
            if chosen is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        collected = []
        for option in _options_xpath(self):
            option_value = option.get('value')
            if option_value is None:
                option_value = option.text or ''
            if option_value:
                option_value = option_value.strip()
            collected.append(option_value)
        return collected

    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)

HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        # The select element whose options this object reads and modifies.
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    options = property(options)

    def __iter__(self):
        """Yield the value of every option carrying a ``selected`` attribute."""
        for option in self.options:
            if 'selected' in option.attrib:
                # An option without a ``value`` attribute falls back to
                # its (stripped) text.
                opt_value = option.get('value')
                if opt_value is None:
                    opt_value = option.text or ''
                if opt_value:
                    opt_value = opt_value.strip()
                yield opt_value

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ValueError if no option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = option.text or ''
            if opt_value:
                opt_value = opt_value.strip()
            if opt_value == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ValueError if that option is not selected, or if no
        option has that value.
        """
        for option in self.options:
            opt_value = option.get('value')
            if opt_value is None:
                opt_value = option.text or ''
            if opt_value:
                opt_value = opt_value.strip()
            if opt_value == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            # Same wording as ``add`` (the message used to read
            # "There is not option ...").
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1221

class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements sharing one name.

    Besides the normal list behaviour, the ``.value`` property checks
    and unchecks the inputs, and ``.value_options`` lists the possible
    values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        checked = [radio for radio in self if 'checked' in radio.attrib]
        if checked:
            return checked[0].get('value')
        return None

    def _value__set(self, value):
        target = None
        if value is not None:
            # Locate the radio to check first, so an unknown value
            # raises before anything is unchecked.
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        possible = []
        for radio in self:
            possible.append(radio.get('value'))
        return possible

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        class_name = self.__class__.__name__
        members = list.__repr__(self)
        return '%s(%s)' % (class_name, members)

1273

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before clearing: a rejected assignment must not wipe
        # the boxes that are currently checked.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def _value__del(self):
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1312

class CheckboxValues(SetMixin):

    """
    Set-like view of the values of the checked checkboxes in a group
    of same-named checkboxes.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Build the list up front, then hand out an iterator over it.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """
        Check the first checkbox whose value is ``value``; raise
        KeyError if the group has no such checkbox.
        """
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose value is ``value``; raise
        KeyError if it is already unchecked or does not exist.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)

1355

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type, lower-cased and defaulting to
    ``'text'``.

    ``.value`` reads and writes the value.

    Checkboxes and radios have ``input.checkable == True`` (false for
    every other type) together with a boolean ``.checked`` attribute.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truthiness of ``value`` drives the
        # checked state; only a string additionally replaces ``value``.
        should_check = bool(value)
        self.checked = should_check
        if should_check and isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ['checkbox', 'radio']

    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)

HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Returns None when
        the label has no (or an empty) ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): what happens when no element has this id depends
        # on get_element_by_id(), which is defined elsewhere -- confirm
        # whether it raises or returns None.
        return self.body.get_element_by_id(target_id)

    def _for_element__set(self, other):
        """Point this label at ``other``, which must have an ``id`` attribute."""
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link is stored in the ``for`` attribute (see the getter and
        # setter); the label's own ``id`` must be left untouched.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)

HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Move every tag of an HTML tree that has no namespace yet into
    the XHTML namespace, in place.
    """
    # Accept either an ElementTree or an element.
    try:
        html = html.getroot()
    except AttributeError:
        pass
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    for node in html.iter():
        name = node.tag
        # Nodes whose tag is not a string are left alone.
        if not isinstance(name, basestring):
            continue
        # A leading '{' means the tag already carries a namespace.
        if name[0] == '{':
            continue
        node.tag = namespace_prefix + name

1491

def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree, in
    place, turning it into plain HTML.
    """
    # Accept either an ElementTree or an element.
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        pass
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(namespace_prefix)
    # Only elements in the XHTML namespace are visited.
    for node in xhtml.iter(namespace_prefix + "*"):
        node.tag = node.tag[cut:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if method != 'html' or include_meta_content_type:
        return serialised
    # Pick the substitution matching the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)

tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards; this is
    mainly a debugging aid.
    """
    import os
    import webbrowser
    import tempfile
    # Wrap bare elements so that ``write()`` and ``docinfo`` are available.
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    descriptor, path = tempfile.mkstemp(suffix='.html')
    out = os.fdopen(descriptor, 'wb')
    try:
        # Encoding preference: explicit argument, then the document's own
        # encoding, then UTF-8.
        doc.write(out, method="html",
                  encoding=encoding or doc.docinfo.encoding or "UTF-8")
    finally:
        # we leak the file itself here, but we should at least close it
        out.close()
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)

1608 1609 ################################################################################ 1610 # configure Element class lookup 1611 ################################################################################ 1612

class HTMLParser(etree.HTMLParser):
    """An HTML parser set up to produce lxml.html Element objects.

    All keyword arguments are passed on to ``etree.HTMLParser``.
    """
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        # Install the lookup that maps tags to the lxml.html classes.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1620

class XHTMLParser(etree.XMLParser):
    """An XML parser set up to produce lxml.html Element objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        # Install the lookup that maps tags to the lxml.html classes.
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1641

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    passed on to ``makeelement()`` of the module-level ``html_parser``.
    """
    return html_parser.makeelement(*args, **kw)

html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Source Code for Package lxml.html