lxml.html

1 """The ``lxml.html`` tool set for HTML handling. 2 """ 3 4 import threading 5 import re 6 try: 7 from urlparse import urljoin 8 except ImportError: 9 # Python 3 10 from urllib.parse import urljoin 11 import copy 12 from lxml import etree 13 from lxml.html import defs 14 from lxml import cssselect 15 from lxml.html._setmixin import SetMixin 16 try: 17 from UserDict import DictMixin 18 except ImportError: 19 # DictMixin was introduced in Python 2.4 20 from lxml.html._dictmixin import DictMixin 21 try: 22 set 23 except NameError: 24 # Python 2.3 25 from sets import Set as set 26 try: 27 bytes = __builtins__["bytes"] 28 except (KeyError, NameError): 29 # Python < 2.6 30 bytes = str 31 try: 32 unicode = __builtins__["unicode"] 33 except (KeyError, NameError): 34 # Python 3 35 unicode = str 36 try: 37 basestring = __builtins__["basestring"] 38 except (KeyError, NameError): 39 # Python 3 40 basestring = (str, bytes) 41

def __fix_docstring(s):
    """Adapt string-literal prefixes in a docstring to the running Python.

    On Python 3 a ``u'`` that starts a line (after optional whitespace) is
    rewritten to a plain ``'``; on Python 2 the same is done for ``b'``.
    An empty or ``None`` docstring is returned unchanged.
    """
    if s:
        import sys
        # The prefix to strip is the one the running interpreter does not
        # print in its own reprs.
        prefix = 'b'
        if sys.version_info[0] >= 3:
            prefix = 'u'
        pattern = re.compile(r"^(\s*)" + prefix + "'", re.M)
        return pattern.sub(r"\1'", s)
    return s

# Public API of the module.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions; each one matches both plain HTML tags and
# their XHTML-namespaced counterparts (prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Matches CSS ``url(...)`` references; group 1 is the (possibly quoted) URL
# between the literal parentheses.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
# Matches CSS ``@import "..."`` statements; group 1 is the imported URL.
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Splits the space-separated URL list of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``.

    Returns ``(text, position)``.  When ``s`` starts and ends with the same
    quote character (``"`` or ``'``) the quotes are removed and ``pos`` is
    advanced by one to point at the unquoted text; otherwise both values
    are returned unchanged.
    """
    first, last = s[:1], s[-1:]
    for quote in ('"', "'"):
        if first == quote and last == quote:
            return s[1:-1], pos + 1
    return s, pos

81

def _transform_result(typ, result):
    """Convert the result back into the input type.

    If ``typ`` is a byte string type, ``result`` is serialised as UTF-8;
    if it is a unicode string type, it is serialised to a unicode string.
    For any other type ``result`` is returned as it is.
    """
    # Order matters: the bytes check comes first, as in the original chain.
    for string_type, encoding in ((bytes, 'utf-8'), (unicode, unicode)):
        if issubclass(typ, string_type):
            return tostring(result, encoding=encoding)
    return result

91

def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace.

    String tags of the form ``{<XHTML namespace>}name`` are reduced to
    ``name``.  Other strings and non-string tags (as used for comments
    and similar nodes) are returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    in_xhtml = tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE
    if in_xhtml:
        return tag.split('}')[-1]
    return tag

97

class HtmlMixin(object):
    """
    Mix-in with the HTML specific API (forms, labels, tree editing and
    link handling) that is shared by the node classes of this module.
    """

    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
    head = property(head, doc=head.__doc__)

    def _label__get(self):
        """
        Get or set any <label> element associated with this element.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]
    def _label__set(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)
    def _label__del(self):
        label = self.label
        if label is not None:
            del label.attrib['for']
    label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            # Keep the tail text in the document by attaching it to
            # whatever precedes this element.
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding=unicode))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            # The tail follows the last child if there is one, otherwise
            # it follows whatever preceded this element.
            if len(self):
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element by its children in the parent.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The ``rel`` value is compared case-insensitively.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
        that pre-compiling the expression can provide a substantial
        speedup.
        """
        return cssselect.CSSSelector(expr)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        Raises TypeError if no base URL is given and the document has none.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()
        def link_repl(href):
            return urljoin(base_url, href)
        # <base href> handling was decided above.  rewrite_links() must not
        # apply its own default of resolving it again, otherwise
        # ``resolve_base_href=False`` would be silently ignored.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
        # All <base href> tags are removed; the href of the last one wins.
        for b in basetags:
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        link_attrs = defs.link_attrs
        for el in self.iter():
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag != 'object':
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            else:
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in 'classid', 'data':
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            if tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            if tag == 'style' and el.text:
                urls = [
                    _unquote_match(match.group(1), match.start(1))
                    for match in _css_url_re.finditer(el.text)
                    ] + [
                    (match.group(1), match.start(1))
                    for match in _css_import_re.finditer(el.text)
                    ]
                if urls:
                    # sort by start pos to bring both match sets back into order
                    urls = [ (start, url) for (url, start) in urls ]
                    urls.sort()
                    # reverse the list to report correct positions despite
                    # modifications
                    urls.reverse()
                    for start, url in urls:
                        yield (el, None, url, start)
            if 'style' in attribs:
                urls = list(_css_url_re.finditer(attribs['style']))
                if urls:
                    # return in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = _unquote_match(match.group(1), match.start(1))
                        yield (el, 'style', url, start)

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()
        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue
            if attrib is None:
                # The link is embedded in the element text at ``pos``.
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.attrib[attrib]
                if not pos and len(cur) == len(link):
                    # Most common case
                    el.attrib[attrib] = new_link
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                    el.attrib[attrib] = new

427 428

class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # ``copy`` is the default for the per-call ``copy`` keyword.
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        input_type = type(doc)
        if not isinstance(doc, basestring):
            # Element input: optionally work on a deep copy so that the
            # caller's tree is left untouched.
            want_copy = kw.pop('copy', self.copy)
            if want_copy:
                doc = copy.deepcopy(doc)
        elif 'copy' in kw:
            raise TypeError(
                "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
        else:
            doc = fromstring(doc, **kw)
        outcome = getattr(doc, self.name)(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if outcome is not None:
            return outcome
        # The method worked in place: hand back the document in the
        # same form it came in.
        return _transform_result(input_type, doc)


# Module-level function versions of the HtmlMixin link and search methods.
# The in-place modifying ones default to working on a copy.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also provides the ``HtmlMixin`` API."""

473

class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also provides the ``HtmlMixin`` API."""

476

class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing instruction class that also provides the ``HtmlMixin`` API."""

479

class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also provides the ``HtmlMixin`` API."""

482 483

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as a sequence of
    ``(tag name, Mixin class)`` pairs.
    """
    # Default tag name -> Element class mapping; filled in at module level
    # as the specialised element classes get defined.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly would yield only its keys, so
            # convert it to (name, class) pairs first.  Sequences of pairs
            # are used as they are.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # applies to every tag that has a class registered
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins go first so that they take precedence over the
                # element class in the method resolution order.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of the given type and tag
        name, or None to fall back to the normal lookup.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

524 525 ################################################################################ 526 # parsing 527 ################################################################################ 528

def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` as a whole document and return the parse result.

    Uses ``html_parser`` unless a ``parser`` is given; extra keyword
    arguments are passed on to ``etree.fromstring()``.  Raises
    ``etree.ParserError`` if parsing yields None.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")

537

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    Raises ``etree.ParserError`` if ``no_leading_text`` is true and the
    input starts with text.
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    is_bytes = isinstance(html, bytes)
    full_doc_pattern = r'\s*<(?:html|!doctype)'
    if is_bytes:
        # Python 3 byte strings can only be matched against a bytes
        # pattern; on Python 2 this conversion is a no-op.
        full_doc_pattern = full_doc_pattern.encode('ascii')
    if re.match(full_doc_pattern, html, re.I) is None:
        # Not a complete document: wrap the fragment so that the parser
        # puts everything into a single <body>.
        if is_bytes:
            # %-interpolating bytes into text would embed their repr on
            # Python 3, so concatenate bytes with bytes instead.
            html = ('<html><body>'.encode('ascii') + html
                    + '</body></html>'.encode('ascii'))
        else:
            html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    if no_leading_text and body.text and body.text.strip():
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    if body.text and body.text.strip():
        elements.append(body.text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements

571

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    Raises ``etree.ParserError`` when the input does not fit these rules.
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a parent element is created
    # that can hold it.
    wrap_in_parent = bool(create_parent)
    nodes = fragments_fromstring(
        html, parser=parser, no_leading_text=not wrap_in_parent,
        base_url=base_url, **kw)

    if not create_parent:
        if not nodes:
            raise etree.ParserError('No elements found')
        if len(nodes) > 1:
            raise etree.ParserError(
                "Multiple elements found (%s)"
                % ', '.join([_element_name(e) for e in nodes]))
        single = nodes[0]
        if single.tail and single.tail.strip():
            raise etree.ParserError(
                "Element followed by text: %r" % single.tail)
        single.tail = None
        return single

    # Any true value that is not a tag name selects the default parent.
    if not isinstance(create_parent, basestring):
        create_parent = 'div'
    new_root = Element(create_parent)
    if nodes:
        if isinstance(nodes[0], basestring):
            # leading text becomes the text of the new parent
            new_root.text = nodes[0]
            del nodes[0]
        new_root.extend(nodes)
    return new_root

617

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    full_doc_pattern = r'\s*<(?:html|!doctype)'
    if isinstance(html, bytes):
        # Python 3 byte strings can only be matched against a bytes
        # pattern; on Python 2 this conversion is a no-op.
        full_doc_pattern = full_doc_pattern.encode('ascii')
    # Matching on the whole string (instead of a short prefix) detects
    # full documents regardless of the amount of leading whitespace.
    is_full_html = re.match(full_doc_pattern, html, re.I) is not None
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        # Looks like a full HTML document
        return doc
    # otherwise, lets parse it out...
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither head nor body: there is nothing to unwrap, so return the
        # document instead of failing on the body checks below.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

680

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    tree = etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
    return tree

693

def _contains_block_level_tag(el):
    """
    Return True if ``el`` or any node yielded by ``el.iter()`` has a tag
    listed in ``defs.block_tags``, otherwise False.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.iter():
        if _nons(node.tag) in block_tags:
            return True
    return False

701

def _element_name(el):
    """
    Return a short name for ``el`` for use in error messages:
    ``'comment'`` for comments, ``'string'`` for text and the tag name
    (without XHTML namespace) for anything else.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

709 710 ################################################################################ 711 # form handling 712 ################################################################################ 713

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.
        """
        return FieldsDict(self.inputs)
    def _fields__set(self, value):
        fields = self.fields
        # Copy the keys: they are modified below while tracking which
        # fields were not mentioned in ``value``.
        prev_keys = list(fields.keys())
        # ``items()`` exists on both Python 2 and Python 3 mappings,
        # whereas ``iteritems()`` is Python 2 only.
        for key, item in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = item
        # Fields that were not given are reset.
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """
        Return an identifier for this form: its ``name``, else ``#`` plus
        its ``id``, else its index among the forms in the document body.
        """
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        forms = list(self.body.iter('form'))
        if not forms:
            forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Unnamed inputs, unchecked checkable inputs and inputs of type
        submit, image or reset are left out.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    # one entry per selected option
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL, the action is returned joined
        with it.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        else:
            return action
    def _action__set(self, value):
        self.set('action', value)
    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()
    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)


# Use FormElement for all <form> tags by default.
HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the values of the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is one of 'GET' or 'POST', the URL is the target URL as a
    string, and the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    # A form without an action is submitted to the URL of its document.
    if form.action:
        url = form.action
    else:
        url = form.base_url
    return open_http(form.method, url, values)

857

def open_http_urllib(method, url, values):
    """
    Default ``open_http`` implementation for `submit_form`, based on
    urllib.

    For ``'GET'`` the URL-encoded ``values`` are appended to the query
    string of ``url``; for any other method they are sent as the request
    body.  Returns the result of ``urlopen()``.  Raises ValueError if
    ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # Python 3: urlopen() only accepts a bytes request body.  The
            # URL-encoded form data is plain ASCII.
            data = data.encode('ASCII')
    return urlopen(url, data)

877

class FieldsDict(DictMixin):
    """
    Dictionary-like view on the values of the inputs of a form.  Reading
    and writing an item gets and sets the ``.value`` of the input of that
    name; items cannot be deleted.
    """

    def __init__(self, inputs):
        # ``inputs`` is the input accessor of the form
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)

898

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    # Both expressions match on the local name, so they find the elements
    # with or without a namespace.
    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        first = matches[0]
        if len(matches) > 1:
            # Several radio buttons or checkboxes sharing a name are
            # handed out as one group; the type of the first decides.
            input_type = first.get('type')
            if input_type == 'radio':
                group = RadioGroup(matches)
                group.name = name
                return group
            if input_type == 'checkbox':
                group = CheckboxGroup(matches)
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        # distinct names of all inputs; unnamed inputs are left out
        names = set([el.name for el in self])
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        all_inputs = self._all_xpath(self.form)
        return iter(all_inputs)

964

class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        # deleting a name that is not set is not an error
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # The type is only shown for classes that define a non-empty one.
        input_type = getattr(self, 'type', None)
        type_info = ''
        if input_type:
            type_info = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_info)

992

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and
    the content can be read, replaced or removed through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Elements in the XHTML namespace are serialised as XML,
        # everything else as HTML.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        content = self.text or ''
        for child in self:
            # it's rare that we actually get here, so let's not use ''.join()
            content += etree.tostring(child, method=method, encoding=unicode)
        return content

    def _value__set(self, value):
        # Drop any child elements, then store the new text.
        del self[:]
        self.text = value

    def _value__del(self):
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is None:
                continue
            # The first selected option wins; its text stands in for a
            # missing ``value`` attribute.
            selected = option.get('value')
            if selected is None:
                selected = option.text or ''
            if selected:
                selected = selected.strip()
            return selected
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        chosen = None
        if value is not None:
            value = value.strip()
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = option.text or ''
                if candidate:
                    candidate = candidate.strip()
                if candidate == value:
                    chosen = option
                    break
            if chosen is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Deselect everything, then select the requested option (if any).
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of each ``<option>`` element, or its text when that
        attribute is missing).
        """
        values = []
        for option in _options_xpath(self):
            item = option.get('value')
            if item is None:
                item = option.text or ''
            if item:
                item = item.strip()
            values.append(item)
        return values

    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)


HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))

    options = property(options)

    def _option_value(self, option):
        """
        Return the value an ``<option>`` stands for: its ``value``
        attribute or, when that is missing, its text; surrounding
        whitespace is stripped.
        """
        value = option.get('value')
        if value is None:
            value = option.text or ''
        if value:
            value = value.strip()
        return value

    def __iter__(self):
        """
        Yield the value of every option that is currently selected.
        """
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """
        Select the first option whose value equals ``item``.

        Raises ValueError if no option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value equals ``item``.

        Raises ValueError if that option is not selected, or if no
        option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1186

class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the normal list behaviour, the ``.value`` property
    checks/unchecks the inputs and ``.value_options`` lists the
    possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def _value__set(self, value):
        chosen = None
        if value is not None:
            for radio in self:
                if radio.get('value') == value:
                    chosen = radio
                    break
            if chosen is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        # Uncheck every radio, then check the requested one (if any).
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if chosen is not None:
            chosen.set('checked', '')

    def _value__del(self):
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        values = []
        for radio in self:
            values.append(radio.get('value'))
        return values

    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        items = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, items)

1238

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before touching anything, so that a rejected
        # assignment leaves the current check state intact.  Strings
        # are iterable on Python 3 but are never a list of values.
        if not hasattr(value, '__iter__') or isinstance(value, basestring):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)

    def _value__del(self):
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1270

class CheckboxValues(SetMixin):

    """
    Set-like view of the values of the checked checkboxes in a group
    of checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Take a snapshot of the checked values before iterating.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        # Check the first checkbox carrying this value.
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        # Uncheck the first checkbox carrying this value.
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__, shown, self.group.name)

1313

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value reports ``'on'``;
        an unchecked one reports None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            # Only a string replaces the value attribute; any other
            # true value merely checks the input.
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)


HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        try:
            return self.body.get_element_by_id(target_id)
        except KeyError:
            # The ``for`` attribute names an id that is not in the
            # document; report that as "not found", as documented.
            return None

    def _for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # Unlink the label by dropping its ``for`` attribute; the
        # label's own ``id`` must stay untouched.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)


HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Convert all tags in an HTML tree to XHTML by moving them to the
    XHTML namespace.

    ``html`` may be a tree (anything with ``getroot()``) or an element.
    Tags that already carry a namespace, and non-string tags, are left
    alone.
    """
    try:
        root = html.getroot()
    except AttributeError:
        root = html
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    for element in root.iter():
        name = element.tag
        if not isinstance(name, basestring):
            continue
        if name[0] != '{':
            element.tag = namespace_prefix + name

1449

def xhtml_to_html(xhtml):
    """Convert all tags in an XHTML tree to HTML by removing their
    XHTML namespace.

    ``xhtml`` may be a tree (anything with ``getroot()``) or an element.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        root = xhtml
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    start = len(namespace_prefix)
    # Only elements in the XHTML namespace are visited.
    for element in root.iter(namespace_prefix + "*"):
        element.tag = element.tag[start:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if method != 'html' or include_meta_content_type:
        return serialised
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)


tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards.  This
    is mainly meant for debugging.

    ``encoding`` overrides the output encoding; without it the
    document's own encoding is used, falling back to UTF-8.
    """
    import os
    import webbrowser
    import tempfile
    tree = doc
    if not isinstance(tree, etree._ElementTree):
        tree = etree.ElementTree(tree)
    descriptor, path = tempfile.mkstemp(suffix='.html')
    out = os.fdopen(descriptor, 'wb')
    try:
        tree.write(out, method="html",
                   encoding=encoding or tree.docinfo.encoding or "UTF-8")
    finally:
        # we leak the file itself here, but we should at least close it
        out.close()
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)

1566 1567 ################################################################################ 1568 # configure Element class lookup 1569 ################################################################################ 1570

class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.
    """

    def __init__(self, **kwargs):
        # Keyword arguments are handed to ``etree.HTMLParser`` unchanged.
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1578

class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """

    def __init__(self, **kwargs):
        # Keyword arguments are handed to ``etree.XMLParser`` unchanged.
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1599

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    passed on to the ``makeelement()`` method of the module-level
    ``html_parser``.
    """
    return html_parser.makeelement(*args, **kw)


html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Source Code for Package lxml.html