lxml.html

1 """The ``lxml.html`` tool set for HTML handling. 2 """ 3 4 import threading 5 import re 6 try: 7 from urlparse import urljoin 8 except ImportError: 9 # Python 3 10 from urllib.parse import urljoin 11 import copy 12 from lxml import etree 13 from lxml.html import defs 14 from lxml import cssselect 15 from lxml.html._setmixin import SetMixin 16 try: 17 from UserDict import DictMixin 18 except ImportError: 19 # DictMixin was introduced in Python 2.4 20 from lxml.html._dictmixin import DictMixin 21 try: 22 set 23 except NameError: 24 # Python 2.3 25 from sets import Set as set 26 try: 27 bytes = __builtins__["bytes"] 28 except (KeyError, NameError): 29 # Python < 2.6 30 bytes = str 31 try: 32 unicode = __builtins__["unicode"] 33 except (KeyError, NameError): 34 # Python 3 35 unicode = str 36 try: 37 basestring = __builtins__["basestring"] 38 except (KeyError, NameError): 39 # Python 3 40 basestring = (str, bytes) 41

def __fix_docstring(s):
    """
    Normalise string reprs at the start of doctest output lines.

    On Python 3 a leading ``u'`` (after optional whitespace) is rewritten
    to ``'``; on Python 2 a leading ``b'`` is rewritten instead.  Only
    occurrences at the beginning of a line are touched.
    """
    import sys
    if sys.version_info[0] < 3:
        prefix = "b"
    else:
        prefix = "u"
    pattern = re.compile(r"^(\s*)%s'" % prefix, re.M)
    return pattern.sub(r"\1'", s)


# Public API of the module.
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both plain HTML tags
# and their XHTML-namespaced equivalents (prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Matches ``url(...)`` in CSS; group 1 is whatever is between the
# parentheses.  The parentheses must be escaped to be matched literally.
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
# Matches ``@import "..."`` in CSS; group 1 is the quoted target.
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Splits the space-separated URL list of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')


def _transform_result(typ, result):
    """
    Convert ``result`` back into the kind of object the caller passed in.

    ``typ`` is the type of the original input: for a byte string type the
    result is serialised as UTF-8 bytes, for a unicode type it is
    serialised to a unicode string, and for anything else ``result`` is
    returned unchanged.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    if issubclass(typ, unicode):
        return tostring(result, encoding=unicode)
    return result

83

def _nons(tag):
    """
    Strip the XHTML namespace from a tag name.

    ``'{http://www.w3.org/1999/xhtml}div'`` becomes ``'div'``.  Tags in
    other namespaces, and non-string tags (as found on comments and
    similar nodes), are returned unchanged.
    """
    if not isinstance(tag, basestring):
        return tag
    namespace_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:namespace_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag

89

class HtmlMixin(object):
    """
    Mixin providing the HTML-specific API shared by the HTML node classes
    of this module (elements, comments, processing instructions and
    entities).

    It relies on the ``lxml.etree`` element interface of the class it is
    mixed into (``getparent()``, ``xpath()``, ``iter()``, ``attrib``, ...).
    """

    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
    head = property(head, doc=head.__doc__)

    def _label__get(self):
        """
        Get or set any <label> element associated with this element.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]
    def _label__set(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if _nons(label.tag) != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)
    def _label__del(self):
        label = self.label
        if label is not None:
            del label.attrib['for']
    label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            # Keep the tail text by attaching it to whatever precedes us.
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print(tostring(h, encoding=unicode))
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # Our tail follows our last child once the tag is gone.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element in the parent by its own children.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison of the ``rel`` value is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
        that pre-compiling the expression can provide a substantial
        speedup.
        """
        return cssselect.CSSSelector(expr)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        Raises TypeError if no base URL is given and the document has
        none either.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()
        def link_repl(href):
            return urljoin(base_url, href)
        self.rewrite_links(link_repl)

    def resolve_base_href(self):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.
        """
        base_href = None
        basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
        for b in basetags:
            # If there are several <base> tags, the last one wins.
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.
        """
        link_attrs = defs.link_attrs
        for el in self.iter():
            attribs = el.attrib
            tag = _nons(el.tag)
            if tag != 'object':
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            elif tag == 'object':
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in 'classid', 'data':
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    # archive is a space-separated list of URLs
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            if tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            if tag == 'style' and el.text:
                for match in _css_url_re.finditer(el.text):
                    yield (el, None, match.group(1), match.start(1))
                for match in _css_import_re.finditer(el.text):
                    yield (el, None, match.group(1), match.start(1))
            if 'style' in attribs:
                for match in _css_url_re.finditer(attribs['style']):
                    yield (el, 'style', match.group(1), match.start(1))

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.  Any further links that
        were found in that same attribute or text are then skipped.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()
        # Several links can live in one string (a <style> text, a style
        # attribute, an archive attribute).  Snapshot all links first so
        # that every ``pos`` refers to the unmodified string, then correct
        # each position by the length changes already applied before it.
        applied = {}   # (el, attrib) -> [(original pos, length delta), ...]
        removed = {}   # (el, attrib) -> True once that text/attribute is gone
        for el, attrib, link, pos in list(self.iterlinks()):
            key = (el, attrib)
            if key in removed:
                continue
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                removed[key] = True
                continue
            edits = applied.setdefault(key, [])
            shift = 0
            for edit_pos, delta in edits:
                if edit_pos < pos:
                    shift += delta
            start = pos + shift
            end = start + len(link)
            if attrib is None:
                el.text = el.text[:start] + new_link + el.text[end:]
            else:
                cur = el.attrib[attrib]
                if not start and len(cur) == len(link):
                    # Most common case
                    el.attrib[attrib] = new_link
                else:
                    el.attrib[attrib] = cur[:start] + new_link + cur[end:]
            edits.append((pos, len(new_link) - len(link)))
396 397

class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        """
        ``name`` is the method name to look up on the document,
        ``copy`` the default for whether element inputs are deep-copied
        before the method runs, and ``source_class`` the class whose
        method docstring is reused for this object.
        """
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__
    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        String input is parsed with ``fromstring()`` first; the ``copy``
        keyword is only accepted for element input (TypeError otherwise).
        """
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # Use a local name that does not shadow the ``copy`` module,
            # which is needed for deepcopy() just below.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            return _transform_result(result_type, doc)
        else:
            return result


# Module-level function versions of the HtmlMixin methods of the same
# name.  Each accepts either an element or an HTML string (see
# _MethodFunc).  With copy=True an element argument is deep-copied by
# default before the method runs on it.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)


class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also carries the ``HtmlMixin`` API."""

442

class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also carries the ``HtmlMixin`` API."""

445

class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction node class that also carries the ``HtmlMixin`` API."""

448

class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also carries the ``HtmlMixin`` API."""

451 452

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    # Shared registry of tag name -> Element class; copied per instance
    # when no ``classes`` argument is given.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Accept the documented mapping form as well as a sequence of
            # pairs; iterating a dict directly would only yield its keys.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so they take precedence over the
                # element class they are mixed into.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of the given type, or None to
        fall back to the normal lookup.  Element tag names are matched
        case-insensitively.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

493 494 ################################################################################ 495 # parsing 496 ################################################################################ 497

def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` as a whole document and return its root element.

    Uses the module's default ``html_parser`` unless ``parser`` is given;
    extra keyword arguments are passed on to ``etree.fromstring()``.
    Raises ``etree.ParserError`` if parsing yields no root element.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")

506

def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    start = html[:20].lstrip().lower()
    looks_like_document = (start.startswith('<html')
                           or start.startswith('<!doctype'))
    if not looks_like_document:
        # Wrap bare fragments so the parser sees a complete document.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading_text = body.text
    has_leading_text = leading_text and leading_text.strip()
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % leading_text)
    elements = []
    if has_leading_text:
        elements.append(leading_text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements

540

def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if create_parent:
        parent_tag = create_parent
        if not isinstance(parent_tag, basestring):
            # Any other true value just means "wrap it in a <div>".
            parent_tag = 'div'
        wrapped = '<%s>%s</%s>' % (parent_tag, html, parent_tag)
        return fragment_fromstring(wrapped, parser=parser,
                                   base_url=base_url, **kw)
    elements = fragments_fromstring(html, parser=parser, no_leading_text=True,
                                    base_url=base_url, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        names = ', '.join([_element_name(e) for e in elements])
        raise etree.ParserError(
            "Multiple elements found (%s)" % names)
    el = elements[0]
    tail = el.tail
    if tail and tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % tail)
    # Only whitespace (or nothing) followed the element; drop it.
    el.tail = None
    return el

576

def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    The result is the whole document if the input looks like one (or if
    the parser produced a ``<head>``, or no ``<body>`` at all), the
    single element if the body holds exactly one element and no text,
    and otherwise the body renamed to ``<div>`` or ``<span>``.
    """
    if parser is None:
        parser = html_parser
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, parser=parser, base_url=base_url, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <head> nor <body>: there is nothing to unwrap, so hand
        # back the document instead of failing on len(None) below.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

639

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)

652

def _contains_block_level_tag(el):
    """
    Return True if ``el`` or any of its descendants is a block-level tag
    (one listed in ``defs.block_tags``), False otherwise.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.iter():
        if _nons(node.tag) in block_tags:
            return True
    return False

660

def _element_name(el):
    """
    Return a short name for an item of a parsed fragment list, for use
    in error messages: ``'comment'`` for comments, ``'string'`` for text
    items, and the namespace-free tag name for anything else.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)

668 669 ################################################################################ 670 # form handling 671 ################################################################################ 672

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.
        """
        return FieldsDict(self.inputs)
    def _fields__set(self, value):
        # ``value`` is a mapping of field name -> new value.  Fields that
        # are not mentioned in it are reset to None.
        fields = self.fields
        # Copy into a list so names can be removed as they are handled.
        prev_keys = list(fields.keys())
        # items() works on both Python 2 and Python 3 mappings.
        for key, field_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = field_value
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """
        Return a name for this form for use in reprs: its ``name``
        attribute, else ``'#'`` plus its ``id``, else its index among the
        forms found below the document body.
        """
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        # Search the whole body, not only its direct children, so that a
        # form nested inside other elements is found as well.
        body = self.body
        forms = list(body.iter('form'))
        if not forms:
            forms = list(body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Unnamed inputs, unchecked checkable inputs, submit/image/reset
        inputs and fields whose value is None are left out.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    # One (name, value) pair per selected option.
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL, the returned action is joined
        with it.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        else:
            return action
    def _action__set(self, value):
        self.set('action', value)
    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()
    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)

777 778 HtmlElementClassLookup._default_element_classes['form'] = FormElement 779

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the form's own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is one of 'GET' or 'POST', the URL is the target URL as a
    string, and the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    opener = open_http
    if opener is None:
        opener = open_http_urllib
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    return opener(form.method, form.action, values)

812

def open_http_urllib(method, url, values):
    """
    Default HTTP opener for ``submit_form()``, based on the standard
    library.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request
    body.  Returns whatever ``urlopen()`` returns.

    Raises ValueError if ``url`` is empty or None.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError:
        # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # Python 3: the request body has to be bytes
            data = data.encode('ASCII')
    return urlopen(url, data)

826

class FieldsDict(DictMixin):
    """
    Dictionary-style view of a form's field values.

    Reading or assigning an item reads or assigns the ``.value`` of the
    matching entry of the wrapped input accessor.  Items cannot be
    deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)

847

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    # Both expressions match on the local name, so they find plain HTML
    # and namespaced elements alike.
    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        input_type = results[0].get('type')
        if len(results) > 1 and input_type in ('radio', 'checkbox'):
            # Several same-named radio/checkbox inputs act as one group.
            if input_type == 'radio':
                group = RadioGroup(results)
            else:
                group = CheckboxGroup(results)
            group.name = name
            return group
        # I don't like throwing away elements like this
        return results[0]

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        names = set()
        for el in self:
            names.add(el.name)
        # Unnamed inputs show up as None; they are not valid keys.
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        all_inputs = self._all_xpath(self.form)
        return iter(all_inputs)

913

class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Not every class using this mix-in has a ``type`` attribute.
        input_type = getattr(self, 'type', None)
        type_part = ''
        if input_type:
            type_part = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)

941

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        text = self.text
        if not text:
            # An element without text still has a (blank) value.
            return ''
        return text

    def _value__set(self, value):
        self.text = value

    def _value__del(self):
        self.text = ''

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

958 959 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement 960

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                # FIXME: If value is None, what to return?, get_text()?
                return option.get('value')
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        # Single select: locate the target option first, so that an unknown
        # value raises before any existing selection is touched.
        target = None
        if value is not None:
            for option in _options_xpath(self):
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    target = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in _options_xpath(self):
            attributes = option.attrib
            if 'selected' in attributes:
                del attributes['selected']
        if value is not None:
            target.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements.
        """
        values = []
        for option in _options_xpath(self):
            values.append(option.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        attributes = self.attrib
        if 'multiple' in attributes:
            del attributes['multiple']

    multiple = property(_multiple__get, _multiple__set,
                        doc=_multiple__get.__doc__)


# Register the class so that <select> tags are mapped to SelectElement.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """
    # NOTE(review): the remaining set operations come from SetMixin, which
    # presumably builds them on __iter__/add/remove -- verify against
    # lxml.html._setmixin.

    def __init__(self, select):
        # The <select> element whose <option> children are inspected.
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))
    options = property(options)

    def __iter__(self):
        # Only options carrying the ``selected`` attribute are members of
        # this set; unselected options must not be reported.
        for option in self.options:
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """
        Select the first option whose ``value`` equals ``item``.

        Raises ValueError if no option has that value.
        """
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose ``value`` equals ``item``.

        Raises ValueError if that option is not currently selected or if
        no option has that value.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1093

class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the normal list behaviour, the ``.value`` property checks
    and unchecks the inputs, and ``.value_options`` lists the possible
    values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def _value__set(self, value):
        # Find the radio to check first, so that an unknown value raises
        # before any existing check mark is removed.
        target = None
        if value is not None:
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            else:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            attributes = radio.attrib
            if 'checked' in attributes:
                del attributes['checked']
        if value is not None:
            target.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio in the group.
        self.value = None

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        values = []
        for radio in self:
            values.append(radio.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        items = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, items)

1145

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before mutating: a rejected assignment must not leave
        # every checkbox unchecked as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def _value__del(self):
        # Deleting the value unchecks every checkbox in the group.
        self.value.clear()

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1177

class CheckboxValues(SetMixin):
    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        # The list-like group of checkbox elements this view operates on.
        self.group = group

    def __iter__(self):
        # Build the list up front so the checkboxes can be modified while
        # the caller is still iterating over the values.
        return iter([
            el.get('value')
            for el in self.group
            if 'checked' in el.attrib])

    def add(self, value):
        """
        Check the first checkbox whose ``value`` equals ``value``.

        Raises KeyError if no checkbox has that value.
        """
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                break
        else:
            raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose ``value`` equals ``value``.

        Raises KeyError if that checkbox is already unchecked or if no
        checkbox has that value.
        """
        for el in self.group:
            if el.get('value') == value:
                if 'checked' in el.attrib:
                    del el.attrib['checked']
                else:
                    raise KeyError(
                        "The checkbox with value %r was already unchecked" % value)
                break
        else:
            raise KeyError(
                "No checkbox with value %r" % value)

    def _group_name(self):
        # The group is a plain list of inputs and need not have a ``name``
        # of its own; fall back to the name of its first checkbox, or None
        # for an empty group.
        group = self.group
        try:
            return group.name
        except AttributeError:
            pass
        if len(group):
            return group[0].name
        return None

    def __repr__(self):
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self._group_name())

1220

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    You can get the type with ``.type`` (which is lower-cased and
    defaults to ``'text'``).

    Also you can get and set the value with ``.value``

    Checkboxes and radios have the attribute ``input.checkable ==
    True`` (for all others it is false) and a boolean attribute
    ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            # Only a string replaces the ``value`` attribute; any other
            # true value merely checks the input.
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    def _value__del(self):
        if self.checkable:
            self.checked = False
            return
        attributes = self.attrib
        if 'value' in attributes:
            del attributes['value']

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ['checkbox', 'radio']
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        attributes = self.attrib
        if 'checked' in attributes:
            del attributes['checked']

    checked = property(_checked__get, _checked__set,
                       doc=_checked__get.__doc__)


# Register the class so that <input> tags are mapped to InputElement.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.
        """
        id = self.get('for')
        if not id:
            return None
        return self.body.get_element_by_id(id)

    def _for_element__set(self, other):
        # The link is stored by id, so the target must have one.
        id = other.get('id')
        if not id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', id)

    def _for_element__del(self):
        # The link lives in the ``for`` attribute (see the getter and
        # setter); the label's own ``id`` must be left untouched.
        attrib = self.attrib
        if 'for' in attrib:
            del attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)


# Register the class so that <label> tags are mapped to LabelElement.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, converting the tree to XHTML in place.
    """
    # Accept both a tree object (anything with getroot()) and a bare element.
    try:
        root = html.getroot()
    except AttributeError:
        root = html
    ns_prefix = "{%s}" % XHTML_NAMESPACE
    for node in root.iter():
        name = node.tag
        # Nodes whose tag is not a string are left alone.
        if not isinstance(name, basestring):
            continue
        # A leading '{' means the tag already carries a namespace.
        if name[0] != '{':
            node.tag = ns_prefix + name

1356

def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree,
    converting the tree to plain HTML in place.
    """
    # Accept both a tree object (anything with getroot()) and a bare element.
    try:
        root = xhtml.getroot()
    except AttributeError:
        root = xhtml
    ns_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(ns_prefix)
    # Only elements in the XHTML namespace are visited and renamed.
    for node in root.iter(ns_prefix + "*"):
        node.tag = node.tag[cut:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub
# The same pattern for byte string output: on Python 3 a str pattern
# cannot be applied to bytes.
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>'.encode('ascii')).sub


def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Note: unless ``include_meta_content_type`` is true, any
    ``<meta http-equiv="Content-Type" ...>`` tag is removed from the
    serialised result.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding)
    if not include_meta_content_type:
        # Pick the substitution that matches the type of the serialised
        # result: text for unicode output, bytes for encoded output.
        if isinstance(html, unicode):
            html = __replace_meta_content_type('', html)
        else:
            html = __bytes_replace_meta_content_type(
                ''.encode('ascii'), html)
    return html


tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    ``doc`` may be anything with a ``write()`` method (such as an element
    tree) or a bare element, which is wrapped in a tree first.  The
    temporary file is not deleted afterwards.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # mkstemp() creates the file atomically, avoiding the race condition of
    # os.tempnam(), which is also unavailable on Python 3.
    handle, fn = tempfile.mkstemp(suffix='.html')
    # The document is written by file name, so the descriptor is not needed.
    os.close(handle)
    write_doc(fn, method="html")
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)

1435 1436 ################################################################################ 1437 # configure Element class lookup 1438 ################################################################################ 1439

class HTMLParser(etree.HTMLParser):
    """``etree.HTMLParser`` subclass that installs an
    ``HtmlElementClassLookup`` as its element class lookup.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded unchanged to the base parser.
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1444

class XHTMLParser(etree.XMLParser):
    """``etree.XMLParser`` subclass that installs an
    ``HtmlElementClassLookup`` as its element class lookup.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded unchanged to the base parser.
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)

1449

def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.
    """
    # All arguments are handed straight to the shared module-level parser.
    return html_parser.makeelement(*args, **kw)


# Module-level parser instances; Element() above builds its elements
# through html_parser.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()

Source Code for Package lxml.html