lxml.html

1 # Copyright (c) 2004 Ian Bicking. All rights reserved. 2 # 3 # Redistribution and use in source and binary forms, with or without 4 # modification, are permitted provided that the following conditions are 5 # met: 6 # 7 # 1. Redistributions of source code must retain the above copyright 8 # notice, this list of conditions and the following disclaimer. 9 # 10 # 2. Redistributions in binary form must reproduce the above copyright 11 # notice, this list of conditions and the following disclaimer in 12 # the documentation and/or other materials provided with the 13 # distribution. 14 # 15 # 3. Neither the name of Ian Bicking nor the names of its contributors may 16 # be used to endorse or promote products derived from this software 17 # without specific prior written permission. 18 # 19 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 23 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 """The ``lxml.html`` tool set for HTML handling. 
32 """ 33 34 import sys 35 import re 36 try: 37 from urlparse import urljoin 38 except ImportError: 39 # Python 3 40 from urllib.parse import urljoin 41 import copy 42 from lxml import etree 43 from lxml.html import defs 44 from lxml.html._setmixin import SetMixin 45 try: 46 from collections import MutableMapping as DictMixin 47 except ImportError: 48 # Python < 2.6 49 from UserDict import DictMixin 50 try: 51 set 52 except NameError: 53 # Python 2.3 54 from sets import Set as set 55 try: 56 bytes 57 except NameError: 58 # Python < 2.6 59 bytes = str 60 try: 61 unicode 62 except NameError: 63 # Python 3 64 unicode = str 65 try: 66 basestring 67 except NameError: 68 # Python 3 69 basestring = (str, bytes) 70

def __fix_docstring(s):
    """Normalise string-literal prefixes in doctest output for this Python.

    On Python 3 a leading ``u'`` at the start of a line (after optional
    whitespace) is rewritten to ``'``; on Python 2 the same is done for a
    leading ``b'``.  Falsy input (``None`` or ``''``) is returned unchanged.
    """
    if not s:
        return s
    if sys.version_info[0] >= 3:
        prefix = "u"
    else:
        prefix = "b"
    # Only touch a prefix that opens a line, keeping the indentation (group 1).
    pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
    return pattern.sub(r"\1'", s)

# Public API of the module.  Several of these names (e.g. ``tostring``,
# ``Element``, ``open_in_browser``) are defined further down in the file.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both plain HTML tags and
# their XHTML-namespaced equivalents (prefix ``x``) where applicable.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Matches CSS ``url(...)``; group 1 is the (possibly quoted) URL.  The
# parentheses around the URL are literal and therefore escaped.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
# Matches CSS ``@import "..."`` (double-quoted form only); group 1 is the URL.
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Splits the space-separated URL list of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``.

    Returns ``(text, position)``.  When ``s`` starts and ends with the same
    quote character (``"`` or ``'``) the quotes are removed and ``pos`` is
    advanced by one to account for the dropped opening quote; otherwise both
    values are returned unchanged.
    """
    first = s[:1]
    last = s[-1:]
    for quote in ('"', "'"):
        if first == quote and last == quote:
            return s[1:-1], pos + 1
    return s, pos

110

def _transform_result(typ, result):
    """Convert the result back into the input type.

    If ``typ`` is a byte-string type, ``result`` is serialised with
    ``tostring`` as UTF-8; if it is a unicode type, it is serialised to a
    unicode string.  For any other type ``result`` is returned untouched.
    """
    for string_type, encoding in ((bytes, 'utf-8'), (unicode, unicode)):
        if issubclass(typ, string_type):
            return tostring(result, encoding=encoding)
    return result

120

def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace.

    Only string tags that begin with ``{`` followed by the XHTML namespace
    URI are shortened to the part after the closing brace; everything else
    (other namespaces, non-string tags) is returned as given.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag

126

class HtmlMixin(object):
    """
    Mix-in holding the HTML-specific helpers (document accessors, tree
    surgery, searching and link handling).  It is combined with the
    ``etree`` base classes to build the HTML node classes of this module
    and relies on the element API (``get``, ``xpath``, ``iter``, ...)
    supplied by those base classes.
    """

    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
    head = property(head, doc=head.__doc__)

    def _label__get(self):
        """
        Get or set any <label> element associated with this element.

        The association is made through the element's ``id`` and the
        label's ``for`` attribute; ``None`` is returned when the element
        has no id or no label refers to it.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]
    def _label__set(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)
    def _label__del(self):
        label = self.label
        if label is not None:
            del label.attrib['for']
    label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding=unicode))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows our last child once the children are
                # moved up into the parent.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        # Replace this element by its own children in the parent.
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison of the ``rel`` value is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr, translator='html'):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to
        ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
        -- note that pre-compiling the expression can provide a substantial
        speedup.
        """
        # Do the import here to make the dependency optional.
        from lxml.cssselect import CSSSelector
        return CSSSelector(expr, translator=translator)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        Raises TypeError if no base URL is given and the document has none.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()
        def link_repl(href):
            return urljoin(base_url, href)
        self.rewrite_links(link_repl)

    def resolve_base_href(self):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
        for b in basetags:
            # If several <base href> tags exist, the last one wins; all of
            # them are removed from the tree.
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter():
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag != 'object':
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            else:
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in 'classid', 'data':
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    # NOTE(review): archive entries are yielded in forward
                    # order and, with a codebase, as joined URLs, unlike the
                    # reversed order described in the docstring -- confirm
                    # whether callers rely on this.
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            if tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            if tag == 'style' and el.text:
                urls = [
                    _unquote_match(match.group(1), match.start(1))
                    for match in _css_url_re.finditer(el.text)
                    ] + [
                    (match.group(1), match.start(1))
                    for match in _css_import_re.finditer(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    urls = [ (start, url) for (url, start) in urls ]
                    urls.sort()
                    # reverse the list to report correct positions despite
                    # modifications
                    urls.reverse()
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_css_url_re.finditer(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.  Any further links reported
        for an attribute that has been removed this way are skipped.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()
        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                elif attrib in el.attrib:
                    # The attribute may already be gone if an earlier link
                    # in the same attribute value was removed.
                    del el.attrib[attrib]
                continue
            if attrib is None:
                text = el.text or ''
                new = text[:pos] + new_link + text[pos+len(link):]
                el.text = new
            else:
                cur = el.get(attrib)
                if cur is None:
                    # Attribute was removed by a previous None replacement;
                    # there is nothing left to rewrite.
                    continue
                if not pos and len(cur) == len(link):
                    # Most common case
                    el.attrib[attrib] = new_link
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                    el.attrib[attrib] = new

458 459

class _MethodFunc(object):
    """
    Function-style wrapper around a method of an element class.

    The resulting callable accepts either an element or an HTML string as
    its first argument.  It hands back whatever the wrapped method returns;
    when that is None (the method worked in-place), the document is
    returned instead, converted back to the type of the input.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # Borrow the documentation of the wrapped method.
        self.__doc__ = getattr(source_class, name).__doc__
        self.name = name
        self.copy = copy

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # An explicit ``copy`` keyword overrides the wrapper's default.
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        result = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # Then return what we got in
        return _transform_result(result_type, doc)

# Module-level function versions of the HtmlMixin methods of the same name.
# Each accepts an element or an HTML string (see _MethodFunc).  With
# copy=True an element input is deep-copied first, so the caller's tree is
# left unmodified unless ``copy=False`` is passed explicitly.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also carries the ``HtmlMixin`` API."""

504

class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also carries the ``HtmlMixin`` API."""

507

class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class that also carries the ``HtmlMixin`` API."""

510

class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also carries the ``HtmlMixin`` API."""

513 514

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    # Tag name -> Element class; filled in at module level as the
    # specialised element classes are defined.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would yield only its keys, so
            # ask for the pairs explicitly when they are available.
            if hasattr(mixins, 'items'):
                mixin_items = mixins.items()
            else:
                mixin_items = mixins
            mixers = {}
            for name, value in mixin_items:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that they take precedence over the
                # element class they are mixed into.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of the given type.

        Elements are looked up by lower-cased tag name, falling back to
        ``HtmlElement``; comments, processing instructions and entities
        get their HTML counterparts.  None is returned for any other node
        type, which defers to the normal lookup.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

555 556 ################################################################################ 557 # parsing 558 ################################################################################ 559

def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` as a complete document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  Raises
    ``etree.ParserError`` if parsing yields no root element.
    """
    active_parser = parser
    if active_parser is None:
        active_parser = html_parser
    root = etree.fromstring(html, active_parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")

568

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (leading text that
    consists only of whitespace is dropped).  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always
    be a list of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    Raises ``etree.ParserError`` if leading text is found although
    ``no_leading_text`` is true.
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    start = html[:20].lstrip().lower()
    if sys.version_info[0] >= 3 and hasattr(start, 'decode'): # Py3 can't mix bytes into startswith()
        start = start.decode('ISO8859-1')
    if not start.startswith('<html') and not start.startswith('<!doctype'):
        if isinstance(html, bytes):
            # %-formatting bytes into a text template would embed the
            # ``b'...'`` repr on Python 3, so wrap byte input by
            # concatenation instead.
            html = ('<html><body>'.encode('ascii') + html
                    + '</body></html>'.encode('ascii'))
        else:
            html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    if no_leading_text and body.text and body.text.strip():
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    if body.text and body.text.strip():
        elements.append(body.text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements

604

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parse exactly one HTML element.

    More than one element, or non-whitespace text before or after the
    element, is an error (``etree.ParserError``).

    When ``create_parent`` is true, the parsed content is wrapped in a new
    parent element instead -- named by ``create_parent`` if that is a
    string, ``div`` otherwise -- and leading or trailing text is accepted.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable if a wrapper element can hold it.
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        parent_tag = create_parent
        if not isinstance(parent_tag, basestring):
            parent_tag = 'div'
        new_root = Element(parent_tag)
        if elements:
            if isinstance(elements[0], basestring):
                new_root.text = elements.pop(0)
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = [_element_name(e) for e in elements]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))
    only = elements[0]
    if only.tail and only.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % only.tail)
    only.tail = None
    return only

650

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    The result is the whole document when the input looks like a full
    document, or when the parsed tree has a <head> or no <body> at all;
    the single element when the body holds exactly one element and no
    surrounding text; otherwise the body, renamed to ``div`` or ``span``.
    """
    if parser is None:
        parser = html_parser
    start = html[:10].lstrip().lower()
    if sys.version_info[0] >= 3 and hasattr(start, 'decode'): # Py3 can't mix bytes into startswith()
        start = start.decode('ISO8859-1')
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, parser=parser, base_url=base_url, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <head> nor <body> (e.g. a frameset document): there is
        # no body to unwrap, so hand back the document as parsed.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

715

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Build an HTML document tree from a filename, URL, or file-like object.

    The return value is a tree rather than an element; call
    ``parse(...).getroot()`` to obtain the document root.

    The ``base_url`` keyword overrides the base URL, which is mainly
    useful when reading from a file-like object.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)

728

def _contains_block_level_tag(el):
    """Tell whether ``el`` or any node below it is a block-level tag.

    Tags are compared, without the XHTML namespace, against
    ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.iter():
        if _nons(node.tag) in block_tags:
            return True
    return False

736

def _element_name(el):
    """Return a short name for ``el`` for use in error messages.

    Comments give ``'comment'``, plain strings give ``'string'``, and
    anything else gives its tag without the XHTML namespace.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

744 745 ################################################################################ 746 # form handling 747 ################################################################################ 748

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Accessor object covering every input element of this form.

        See `InputGetter` for details about the returned object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like view of all the fields in this form.  Assigning
        values through it changes the form.
        """
        return FieldsDict(self.inputs)
    def _fields__set(self, value):
        fields = self.fields
        leftover = fields.keys()
        for field_name, field_value in value.items():
            if field_name in leftover:
                leftover.remove(field_name)
            fields[field_name] = field_value
        # Every field that was not mentioned gets reset.
        for field_name in leftover:
            if field_name is not None:
                fields[field_name] = None
            # else: case of an unnamed input; these aren't really
            # expressed in form_values() anyway.

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        # Prefer the name attribute, then '#' + id, and finally the
        # position of the form among the forms below the body.
        form_name = self.get('name')
        if form_name:
            return form_name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        forms = list(self.body.iter('form'))
        if not forms:
            forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Collect the field values of the form as a list of
        ``(name, value)`` tuples, suitable for ``urllib.urlencode()``.

        Unnamed inputs, unchecked checkable inputs, submit/image/reset
        inputs and inputs whose value is None are left out.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    results.extend([(name, v) for v in value])
                elif value is not None:
                    results.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset'):
                continue
            value = el.value
            if value is not None:
                results.append((name, value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.  When the document has
        a base URL, the value read is joined with it.
        """
        action = self.get('action')
        base_url = self.base_url
        if not base_url or action is None:
            return action
        return urljoin(base_url, action)
    def _action__set(self, value):
        self.set('action', value)
    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        method = self.get('method', 'GET')
        return method.upper()
    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)

# Register FormElement as the default element class for <form> tags.
HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever ``open_http``
    returns; with the default opener that is a file-like object, as from
    ``urllib.urlopen()``, which also has a ``.geturl()`` function showing
    the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the form's own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (``'GET'`` unless the form says
    otherwise), the URL is the target URL as a string (the form's action,
    or the document's base URL if there is no action), and the values are
    a sequence of ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        extra_pairs = extra_values
        if hasattr(extra_pairs, 'items'):
            extra_pairs = extra_pairs.items()
        values.extend(extra_pairs)
    opener = open_http
    if opener is None:
        opener = open_http_urllib
    # A form without an action is submitted to the document's own URL.
    target = form.action or form.base_url
    return opener(form.method, target, values)

892

def open_http_urllib(method, url, values):
    """
    Default HTTP opener for ``submit_form()``, based on urllib.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request body.
    Returns the result of ``urlopen()``.

    Raises ValueError if ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # Python 3's urlopen() only accepts bytes as request body;
            # url-encoded data is plain ASCII.
            data = data.encode('ASCII')
    return urlopen(url, data)

912

class FieldsDict(DictMixin):
    """
    Dictionary-like view of a form's fields, backed by an input accessor
    (``inputs``): reading or assigning an item reads or assigns the
    ``value`` of the input with that name.  Keys cannot be removed.
    """

    def __init__(self, inputs):
        self.inputs = inputs
    def __getitem__(self, item):
        return self.inputs[item].value
    def __setitem__(self, item, value):
        self.inputs[item].value = value
    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")
    def keys(self):
        return self.inputs.keys()
    def __contains__(self, item):
        return item in self.inputs
    def __iter__(self):
        return iter(self.inputs.keys())
    def __len__(self):
        # Count the keys rather than calling len() on the accessor: the
        # accessor is not required to support len(), and the length of a
        # mapping has to match what iterating over it yields.
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())

937

class InputGetter(object):

    """
    Accessor for all the input fields of a form.

    Fields are fetched by name, as in ``form.inputs['field_name']``.
    Several checkboxes sharing one name come back as a list-like
    `CheckboxGroup` (which also supports setting values); radio inputs
    are grouped in the same manner.

    Iterating yields every input element.  That differs from looking up
    each name, because checkboxes and radio elements are then returned
    individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (
            class_name,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        if len(matches) > 1:
            # Several same-named radio buttons or checkboxes form a group;
            # the type of the first match decides.
            input_type = matches[0].get('type')
            group = None
            if input_type == 'radio':
                group = RadioGroup(matches)
            elif input_type == 'checkbox':
                group = CheckboxGroup(matches)
            if group is not None:
                group.name = name
                return group
        # I don't like throwing away elements like this
        return matches[0]

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        # Distinct names of all inputs; unnamed inputs are left out.
        names = set([el.name for el in self])
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        all_inputs = self._all_xpath(self.form)
        return iter(all_inputs)

1003

class InputMixin(object):

    """
    Behaviour shared by all input elements (input, select, and
    textarea): a ``name`` property backed by the ``name`` attribute and
    a descriptive ``repr()``.
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        # Deleting an absent name is a silent no-op.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Only classes that define a truthy ``type`` get it shown.
        type_suffix = ''
        input_type = getattr(self, 'type', None)
        if input_type:
            type_suffix = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_suffix)

1031

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    content can be read, replaced or removed through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        value = self.text or ''
        # Elements in the XHTML namespace are serialised as XML, all
        # others as HTML.
        xhtml_prefix = "{%s}" % XHTML_NAMESPACE
        if self.tag.startswith(xhtml_prefix):
            method = 'xml'
        else:
            method = 'html'
        for child in self:
            # it's rare that we actually get here, so let's not use ''.join()
            value += etree.tostring(child, method=method, encoding=unicode)
        return value

    def _value__set(self, value):
        # Drop all child elements, then store the new text.
        del self[:]
        self.text = value

    def _value__del(self):
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)


# Use this class for every <textarea> tag.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  The name is available as ``.name``.

    ``.value`` is the value of the selected option, except for a
    multi-select element (``<select multiple>``), where it is a
    set-like object.  ``.value_options`` lists the possible values in
    both cases.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is None:
                continue
            # The first selected option wins.  Its text stands in for a
            # missing value attribute.
            selected = option.get('value')
            if selected is None:
                selected = option.text or ''
            if selected:
                selected = selected.strip()
            return selected
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        # Find the option to select before touching any attribute, so an
        # unknown value leaves the current selection alone.
        chosen = None
        if value is not None:
            value = value.strip()
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = option.text or ''
                if candidate:
                    candidate = candidate.strip()
                if candidate == value:
                    chosen = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        values = []
        for option in _options_xpath(self):
            candidate = option.get('value')
            if candidate is None:
                candidate = option.text or ''
            if candidate:
                candidate = candidate.strip()
            values.append(candidate)
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)


# Use this class for every <select> tag.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    from it to unselect the option.
    """

    def __init__(self, select):
        # The <select> element whose options this view reads and edits.
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))
    options = property(options)

    def _option_value(self, option):
        # Value an <option> stands for: its ``value`` attribute or, when
        # that attribute is missing, its text; stripped when non-empty.
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = option.text or ''
        if opt_value:
            opt_value = opt_value.strip()
        return opt_value

    def __iter__(self):
        # Yields the value of every option carrying a ``selected``
        # attribute.
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ValueError if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ValueError if no option has that value, or if the
        matching option is not currently selected.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            # Same wording as add(); the original text read "not option".
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1225

class RadioGroup(list):
    """
    A list of several ``<input type=radio>`` elements sharing one name.

    Besides the normal list behaviour, the ``.value`` property
    checks/unchecks the inputs and ``.value_options`` reports the
    possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' not in radio.attrib:
                continue
            return radio.get('value')
        return None

    def _value__set(self, value):
        # Locate the radio to check first, so an unknown value raises
        # before any input is unchecked.
        target = None
        if value is not None:
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio.
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        possible = []
        for radio in self:
            possible.append(radio.get('value'))
        return possible
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        items = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, items)

1277

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before touching any checkbox, so that a rejected
        # assignment leaves the current state intact.  Strings are
        # refused explicitly: they define __iter__ on Python 3 and would
        # otherwise be consumed character by character.
        if not hasattr(value, '__iter__') or isinstance(value, basestring):
            if self:
                name = self[0].name
            else:
                # An empty group has no checkbox to take the name from.
                name = None
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (name, value))
        # Materialise first: ``value`` may be a live view of this very
        # group, which clear() would empty before it is read.
        new_values = list(value)
        checked = self.value
        checked.clear()
        checked.update(new_values)

    def _value__del(self):
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1316

class CheckboxValues(SetMixin):

    """
    Set-like view of the values of the checked checkboxes in a group
    of checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Snapshot the checked values, so the iterator is not affected
        # by later attribute changes.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        # Check the first checkbox carrying this value.
        for box in self.group:
            if box.get('value') != value:
                continue
            box.set('checked', '')
            return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        # Uncheck the first checkbox carrying this value; it is an error
        # if that checkbox is not checked.
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)

1359

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The type is available as ``.type`` (lower-cased, defaulting to
    ``'text'``).

    The value can be read and changed through ``.value``.

    Checkboxes and radios have ``input.checkable == True`` (it is
    false for all other types) and a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truth of ``value`` drives the checked
        # state; only string values are also stored in the attribute.
        if value:
            self.checked = True
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    def _value__del(self):
        if self.checkable:
            self.checked = False
            return
        if 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ['checkbox', 'radio']
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)


# Use this class for every <input> tag.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if the
        label has no ``for`` attribute.
        """
        id = self.get('for')
        if not id:
            return None
        # NOTE(review): get_element_by_id() is called without a default;
        # confirm whether an id that matches no element raises here
        # rather than returning None.
        return self.body.get_element_by_id(id)

    def _for_element__set(self, other):
        # Point this label at ``other`` by copying its id; an element
        # without an id cannot be referenced and raises TypeError.
        id = other.get('id')
        if not id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', id)

    def _for_element__del(self):
        # The property is backed by the ``for`` attribute (see the getter
        # and setter), so that is the attribute to drop -- not ``id``,
        # which identifies the label itself.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)


# Use this class for every <label> tag.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    ``html`` may be an element or a tree object providing
    ``getroot()``.  The tree is modified in place; tags that already
    start with ``{`` are left alone.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # Already an element.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    for node in html.iter():
        name = node.tag
        # Only string tags are renamed.
        if not isinstance(name, basestring):
            continue
        if name[0] == '{':
            continue
        node.tag = xhtml_prefix + name

1495

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    ``xhtml`` may be an element or a tree object providing
    ``getroot()``.  The tree is modified in place.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # Already an element.
        pass
    xhtml_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(xhtml_prefix)
    # The tag filter restricts iteration to elements in the XHTML
    # namespace, so every visited tag starts with the prefix.
    for node in xhtml.iter(xhtml_prefix + "*"):
        qualified = node.tag
        node.tag = qualified[cut:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# Both names are the bound ``sub`` methods of pre-compiled patterns, so
# they are called as ``replace(replacement, text)``.  The first pattern
# is a native string, the second its ASCII-encoded twin, so that either
# kind of serialised output can be processed.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print,
        encoding=encoding, with_tail=with_tail, doctype=doctype)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    # Pick the substitution matching the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)

tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards.  This
    is mainly meant for debugging.

    ``encoding`` overrides the output encoding; otherwise the encoding
    recorded in the document is used, falling back to UTF-8.
    """
    import os
    import webbrowser
    import tempfile
    # Plain elements are wrapped so that the tree-level write() is
    # available.
    if not isinstance(doc, etree._ElementTree):
        doc = etree.ElementTree(doc)
    descriptor, path = tempfile.mkstemp(suffix='.html')
    out = os.fdopen(descriptor, 'wb')
    try:
        doc.write(out, method="html",
                  encoding=encoding or doc.docinfo.encoding or "UTF-8")
    finally:
        # we leak the file itself here, but we should at least close it
        out.close()
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)

1612 1613 ################################################################################ 1614 # configure Element class lookup 1615 ################################################################################ 1616

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.
    """
    def __init__(self, **kwargs):
        # All keyword arguments go to the base parser unchanged.
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1624

class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        # All keyword arguments go to the base parser unchanged.
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1645

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    passed on to ``html_parser.makeelement()``.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instances, created once when the module is
# imported.  Element() builds its elements through
# html_parser.makeelement().
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Source Code for Package lxml.html