lxml.html

"""The ``lxml.html`` tool set for HTML handling.
"""

import threading
import re
import urlparse
import copy
from lxml import etree
from lxml.html import defs
from lxml import cssselect
from lxml.html._setmixin import SetMixin
try:
    from UserDict import DictMixin
except ImportError:
    # DictMixin was introduced in Python 2.4
    from lxml.html._dictmixin import DictMixin
try:
    set
except NameError:
    from sets import Set as set

__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links']

# Pre-compiled XPath expressions and regular expressions shared by the
# helpers below.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# Matches CSS ``url(...)``; group 1 is the text between the parentheses.
# The parentheses must be escaped so they are matched literally.
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
# Matches CSS ``@import "..."``; group 1 is the quoted target.
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]")
# One whitespace-separated entry of an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')

class HtmlMixin(object):
    """
    Mix-in holding the HTML-specific convenience properties and
    methods shared by the HTML node classes in this module.
    """

    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """
        Return a list of all the forms
        """
        return list(self.getiterator('form'))
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.
        """
        return self.xpath('//body')[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.
        """
        return self.xpath('//head')[0]
    head = property(head, doc=head.__doc__)

    def _label__get(self):
        """
        Get or set any <label> element associated with this element.
        """
        id = self.get('id')
        if not id:
            return None
        result = _label_xpath(self, id=id)
        if not result:
            return None
        else:
            return result[0]

    def _label__set(self, label):
        id = self.get('id')
        if not id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if not label.tag == 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', id)

    def _label__del(self):
        label = self.label
        if label is not None:
            del label.attrib['for']
    label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            previous = self.getprevious()
            if previous is None:
                # No preceding sibling: the tail becomes parent text.
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('.//b').drop_tag()
            >>> print tostring(h)
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows the last child once the tag is gone.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        index = parent.index(self)
        # Replace this element by its own children, in place.
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.

        The comparison of ``rel`` values is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            else:
                raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
        that pre-compiling the expression can provide a substantial
        speedup.
        """
        return cssselect.CSSSelector(expr)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        Raises TypeError if no ``base_url`` is given and the document
        has none either.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()
        def link_repl(href):
            return urlparse.urljoin(base_url, href)
        # <base href> has either just been resolved above or must be
        # ignored, so rewrite_links() must not resolve it (again).
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.
        """
        base_href = None
        basetags = self.xpath('//base[@href]')
        for b in basetags:
            # With several <base> tags, the last href wins.
            base_href = b.get('href')
            b.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: <base href> is *not* taken into account in any way.  The
        link you get is exactly the link in the document.
        """
        link_attrs = defs.link_attrs
        for el in self.getiterator():
            attribs = el.attrib
            if el.tag != 'object':
                for attrib in link_attrs:
                    if attrib in attribs:
                        yield (el, attrib, attribs[attrib], 0)
            else:
                codebase = None
                ## <object> tags have attributes that are relative to
                ## codebase
                if 'codebase' in attribs:
                    codebase = el.get('codebase')
                    yield (el, 'codebase', codebase, 0)
                for attrib in 'classid', 'data':
                    if attrib in attribs:
                        value = el.get(attrib)
                        if codebase is not None:
                            value = urlparse.urljoin(codebase, value)
                        yield (el, attrib, value, 0)
                if 'archive' in attribs:
                    # archive is a space-separated list of URLs
                    for match in _archive_re.finditer(el.get('archive')):
                        value = match.group(0)
                        if codebase is not None:
                            value = urlparse.urljoin(codebase, value)
                        yield (el, 'archive', value, match.start())
            if el.tag == 'param':
                valuetype = el.get('valuetype') or ''
                if valuetype.lower() == 'ref':
                    ## FIXME: while it's fine we *find* this link,
                    ## according to the spec we aren't supposed to
                    ## actually change the value, including resolving
                    ## it.  It can also still be a link, even if it
                    ## doesn't have a valuetype="ref" (which seems to be the norm)
                    ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
                    yield (el, 'value', el.get('value'), 0)
            if el.tag == 'style' and el.text:
                for match in _css_url_re.finditer(el.text):
                    yield (el, None, match.group(1), match.start(1))
                for match in _css_import_re.finditer(el.text):
                    yield (el, None, match.group(1), match.start(1))
            if 'style' in attribs:
                for match in _css_url_re.finditer(attribs['style']):
                    yield (el, 'style', match.group(1), match.start(1))

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()
        # NOTE: ``pos`` values come from matches against the text as it
        # was when iterlinks() scanned it; replacing an earlier link in
        # the same text with one of a different length shifts the
        # positions of the later ones.
        for el, attrib, link, pos in self.iterlinks():
            new_link = link_repl_func(link.strip())
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                continue
            if attrib is None:
                new = el.text[:pos] + new_link + el.text[pos+len(link):]
                el.text = new
            else:
                cur = el.attrib[attrib]
                if not pos and len(cur) == len(link):
                    # Most common case
                    el.attrib[attrib] = new_link
                else:
                    new = cur[:pos] + new_link + cur[pos+len(link):]
                    el.attrib[attrib] = new

343 344

class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # name: name of the method to look up on the parsed document.
        # copy: default for whether element inputs are deep-copied
        #       before the method is applied.
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Apply the wrapped method to ``doc`` (an element or an HTML string).

        For element inputs, the ``copy`` keyword overrides the default
        given at construction time.  ``copy`` is rejected with a
        TypeError for string inputs.
        """
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            return_string = True
            doc = fromstring(doc, **kw)
        else:
            # Keep the flag under its own name: a local called ``copy``
            # would shadow the ``copy`` module used just below.
            make_a_copy = kw.pop('copy', self.copy)
            return_string = False
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            if return_string:
                return tostring(doc)
            else:
                return doc
        else:
            return result


find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class combined with the ``HtmlMixin`` helpers."""


class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class combined with the ``HtmlMixin`` helpers."""


class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class combined with the ``HtmlMixin`` helpers."""


class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity class combined with the ``HtmlMixin`` helpers."""

402 403

class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as a sequence of
    ``(tag name, mixin class)`` pairs.
    """
    # Tag name -> Element class; filled in as the element classes below
    # are defined.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            if hasattr(mixins, 'items'):
                # A mapping, as documented: iterate its (name, class)
                # pairs rather than its keys.
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so they take precedence over the
                # element class they are mixed into.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of the given type, or None
        to fall back to the normal lookup.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

444 445 ################################################################################ 446 # parsing 447 ################################################################################ 448

def document_fromstring(html, **kw):
    """
    Parse ``html`` as a complete document with the module's HTML parser
    and return the result of ``etree.HTML()``.

    Extra keyword arguments are passed on to ``etree.HTML()``.  Raises
    ``etree.ParserError`` when parsing yields nothing.
    """
    root = etree.HTML(html, html_parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")

455

def fragments_fromstring(html, no_leading_text=False, base_url=None, **kw):
    """
    Parse a run of HTML elements and return them as a list.

    If the markup starts with non-whitespace text, that text is the
    first list item (as a string).  With ``no_leading_text`` set, such
    text raises ``etree.ParserError`` instead, so the list only ever
    holds elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    # FIXME: check what happens when you give html with a body, head, etc.
    prefix = html[:20].lstrip().lower()
    is_document = prefix.startswith('<html') or prefix.startswith('<!doctype')
    if not is_document:
        # Wrap bare fragments so the parser sees a full document.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, base_url=base_url, **kw)
    assert doc.tag == 'html'
    bodies = [child for child in doc if child.tag == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading = body.text
    has_leading_text = bool(leading and leading.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    found = []
    if has_leading_text:
        found.append(leading)
    found.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return found

486

def fragment_fromstring(html, create_parent=False, base_url=None, **kw):
    """
    Parse exactly one HTML element and return it.

    More than one element, or any non-whitespace text before or after
    the element, raises ``etree.ParserError``.

    A true ``create_parent`` wraps the markup in a parent element
    first: the given tag name if it is a string, ``div`` otherwise.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if create_parent:
        if isinstance(create_parent, basestring):
            parent_tag = create_parent
        else:
            parent_tag = 'div'
        wrapped = '<%s>%s</%s>' % (parent_tag, html, parent_tag)
        return fragment_fromstring(wrapped, base_url=base_url, **kw)
    found = fragments_fromstring(html, no_leading_text=True, base_url=base_url, **kw)
    if not found:
        raise etree.ParserError(
            "No elements found")
    if len(found) > 1:
        names = [_element_name(item) for item in found]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))
    el = found[0]
    trailing = el.tail
    if trailing and trailing.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    # Whitespace-only tail is dropped.
    el.tail = None
    return el

517

def fromstring(html, base_url=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The whole document is returned when the input looks like a document,
    when the parse result contains a ``<head>``, or when it contains no
    ``<body>`` at all.  A body holding exactly one element (and no
    surrounding text) yields that element.  Otherwise the body itself
    is returned, renamed to ``div`` or ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, base_url=base_url, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <head> nor <body>: there is nothing to unwrap, and
        # the checks below would fail on None.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body

574

def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  The return value is a tree, not an element; call
    ``parse(...).getroot()`` for the document root.

    When no ``parser`` is given, the module's ``html_parser`` is used.
    The ``base_url`` keyword overrides the base URL, which is most
    useful when parsing from a file-like object.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)

587

def _contains_block_level_tag(el):
    """
    Return True if ``el`` or any node yielded by ``el.getiterator()``
    has a tag listed in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    for node in el.getiterator():
        if node.tag in defs.block_tags:
            return True
    return False

595

def _element_name(el):
    """
    Return a short name for ``el`` for use in error messages:
    ``'comment'`` for comments, ``'string'`` for strings, and the tag
    otherwise.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return el.tag

603 604 ################################################################################ 605 # form handling 606 ################################################################################ 607

class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to affect the
        form.
        """
        return FieldsDict(self.inputs)

    def _fields__set(self, value):
        # Assign every given field, then blank out the ones that were
        # present before but are missing from ``value``.
        remaining = self.fields.keys()
        for key, new_value in value.iteritems():
            if key in remaining:
                remaining.remove(key)
            self.fields[key] = new_value
        for key in remaining:
            # Unnamed inputs (key None) aren't really expressed in
            # form_values() anyway, so they are left alone.
            if key is not None:
                self.fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        # Label used in reprs: the name, else '#' + id, else the
        # form's index among the forms found under <body>.
        name = self.get('name')
        if name:
            return name
        ident = self.get('id')
        if ident:
            return '#' + ident
        return str(self.body.findall('form').index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.
        """
        pairs = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = el.tag
            if tag == 'textarea':
                pairs.append((name, el.value))
            elif tag == 'select':
                selected = el.value
                if el.multiple:
                    # One pair per selected value.
                    pairs.extend([(name, v) for v in selected])
                elif selected is not None:
                    pairs.append((name, el.value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                if el.value is not None:
                    pairs.append((name, el.value))
        return pairs

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.
        """
        base_url = self.base_url
        action = self.get('action')
        if not base_url or action is None:
            return action
        return urlparse.urljoin(base_url, action)

    def _action__set(self, value):
        self.set('action', value)

    def _action__del(self):
        attrs = self.attrib
        if 'action' in attrs:
            del attrs['action']

    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()

    def _method__set(self, value):
        self.set('method', value.upper())

    method = property(_method__get, _method__set, doc=_method__get.__doc__)


HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  With the default opener this
    returns a file-like object, as from ``urllib.urlopen()``.  This
    object also has a ``.geturl()`` function, which shows the URL if
    there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` (a mapping or a sequence of ``(name, value)``
    tuples) is appended to the form's own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's ``method``, the URL is the form's
    ``action``, and the values are a sequence of ``(name, value)``
    tuples with the form data.
    """
    payload = form.form_values()
    if extra_values:
        extras = extra_values
        if hasattr(extras, 'items'):
            extras = extras.items()
        payload.extend(extras)
    opener = open_http
    if opener is None:
        opener = open_http_urllib
    return opener(form.method, form.action, payload)

743

def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form()``, built on ``urllib``.

    For ``'GET'`` the url-encoded values are appended to ``url`` as a
    query string; for any other method they are passed to
    ``urllib.urlopen()`` as the request data.
    """
    import urllib
    ## FIXME: should test that it's not a relative URL or something
    encoded = urllib.urlencode(values)
    if method != 'GET':
        return urllib.urlopen(url, encoded)
    if '?' in url:
        separator = '&'
    else:
        separator = '?'
    return urllib.urlopen(url + separator + encoded, None)

757

class FieldsDict(DictMixin):
    """
    Dictionary-style view on the values of a form's inputs, backed by
    the inputs accessor it is given.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        # Fields cannot be removed through this view.
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)

778

class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        first = matches[0]
        input_type = first.get('type')
        if len(matches) > 1:
            # Several same-named radios/checkboxes are handed out as
            # one group object.
            if input_type == 'radio':
                group = RadioGroup(matches)
                group.name = name
                return group
            if input_type == 'checkbox':
                group = CheckboxGroup(matches)
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        # Distinct names of all inputs; unnamed inputs are left out.
        unique = set([el.name for el in self if el.name is not None])
        return list(unique)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)

843

class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')

    def _name__set(self, value):
        self.set('name', value)

    def _name__del(self):
        # Deleting a missing name is a no-op.
        attrs = self.attrib
        if 'name' in attrs:
            del attrs['name']

    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # The type is only shown when the object has a non-empty one.
        input_type = getattr(self, 'type', None)
        if input_type:
            type_part = ' type=%r' % input_type
        else:
            type_part = ''
        class_name = self.__class__.__name__
        return '<%s %x name=%r%s>' % (
            class_name, id(self), self.name, type_part)

871

class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  ``.name`` gives the name, and ``.value``
    reads or writes the value.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        content = self.text
        if content:
            return content
        return ''

    def _value__set(self, value):
        self.text = value

    def _value__del(self):
        self.text = ''

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                # FIXME: If value is None, what to return?, get_text()?
                return option.get('value')
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        # Find the option to select before touching anything, so an
        # unknown value leaves the current selection untouched.
        target = None
        if value is not None:
            for option in self.getiterator('option'):
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    target = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if target is not None:
            target.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        return [option.get('value') for option in self.getiterator('option')]
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)


HtmlElementClassLookup._default_element_classes['select'] = SelectElement

class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return self.select.getiterator('option')
    options = property(options, doc=options.__doc__)

    def __iter__(self):
        # Only selected options are members of the set; yielding every
        # option would report unselected values as selected.
        for option in self.options:
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """
        Select the first option whose value is ``item``; raises
        ValueError if there is no such option.
        """
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value is ``item``; raises
        ValueError if that option is not selected or does not exist.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)

1023

class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the normal list behaviour, the ``.value`` property reads or
    changes which radio is checked, and ``.value_options`` lists the
    values that can be chosen.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        checked = [radio for radio in self if 'checked' in radio.attrib]
        if checked:
            return checked[0].get('value')
        return None

    def _value__set(self, value):
        # Find the radio to check first, so an unknown value raises
        # before any existing check mark is removed.
        chosen = None
        if value is not None:
            for radio in self:
                if radio.get('value') == value:
                    chosen = radio
                    break
            if chosen is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if chosen is not None:
            chosen.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio in the group.
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        possible = []
        for radio in self:
            possible.append(radio.get('value'))
        return possible
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        contents = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, contents)

1075

class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before touching any checkbox, so that a rejected value
        # does not leave the whole group unchecked.
        if not hasattr(value, '__iter__'):
            # An empty group has no element to take the name from.
            if self:
                name = self[0].name
            else:
                name = None
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def _value__del(self):
        # Deleting the value unchecks every checkbox in the group.
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))

1107

class CheckboxValues(SetMixin):

    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        # The group (an iterable of checkbox elements) this view edits.
        self.group = group

    def __iter__(self):
        # Build the full list up front: the iterator is a snapshot and is
        # unaffected by check marks changed while it is being consumed.
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        """
        Check the first checkbox whose value equals ``value``; raises
        KeyError if there is none.
        """
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose value equals ``value``; raises
        KeyError if it is already unchecked or does not exist.
        """
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        members = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            members,
            self.group.name)

1150

class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type, lower-cased and defaulting to
    ``'text'``.

    ``.value`` gets and sets the value.

    Checkboxes and radios have ``input.checkable == True`` (false for
    every other type) and a boolean ``.checked`` attribute.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if self.checked:
            return self.get('value') or 'on'
        return None

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truth of ``value`` drives ``checked``;
        # the attribute itself is only rewritten for a (non-empty) string.
        is_on = bool(value)
        self.checked = is_on
        if is_on and isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)

# Register InputElement under the ``input`` tag name in the lookup's
# default tag-to-class mapping.
# NOTE(review): presumably HtmlElementClassLookup consults this mapping when
# choosing the class for parsed elements -- it is defined outside this chunk.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if the
        label has no (or an empty) ``for`` attribute.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): what happens when no element carries this id is
        # decided by get_element_by_id(), defined outside this chunk.
        return self.body.get_element_by_id(target_id)

    def _for_element__set(self, other):
        # The link is by id, so the target element must have one.
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link lives in the ``for`` attribute; the label's own ``id``
        # is unrelated and must be left alone.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)

# Register LabelElement under the ``label`` tag name in the lookup's
# default tag-to-class mapping.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# (The name is bound to the compiled pattern's ``sub`` method, so it is
# called as ``__replace_meta_content_type(replacement, text)``.)
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    The document is serialised with ``etree.tostring()``.  Unless
    ``include_meta_content_type`` is true, every
    ``<meta http-equiv="Content-Type" ...>`` tag is then stripped from
    the serialised output; when it is true the output is returned
    untouched.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        '<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        '<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        '<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    serialised = etree.tostring(doc, method=method,
                                pretty_print=pretty_print,
                                encoding=encoding)
    if include_meta_content_type:
        return serialised
    return __replace_meta_content_type('', serialised)

1320

1321 -def open_in_browser(doc):

1322 """ 1323 Open the HTML document in a web browser (saving it to a temporary 1324 file to open it). 1325 """ 1326 import os 1327 import webbrowser 1328 try: 1329 write_doc = doc.write 1330 except AttributeError: 1331 write_doc = etree.ElementTree(element=doc).write 1332 fn = os.tempnam() + '.html' 1333 write_doc(fn, method="html") 1334 url = 'file://' + fn.replace(os.path.sep, '/') 1335 print url 1336 webbrowser.open(url)

1337 1338 ################################################################################ 1339 # configure Element class lookup 1340 ################################################################################ 1341

class HTMLParser(etree.HTMLParser):
    """
    An ``etree.HTMLParser`` that has an ``HtmlElementClassLookup``
    installed as its element class lookup.
    """

    def __init__(self, **kwargs):
        """
        Forward all keyword arguments to ``etree.HTMLParser`` and then
        install a fresh ``HtmlElementClassLookup`` on the parser.
        """
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.setElementClassLookup(lookup)

1346

1347 -def Element(*args, **kw):

1348 v = html_parser.makeelement(*args, **kw) 1349 return v

# Module-level parser instance, created with default options; Element()
# above builds its elements through this parser.
html_parser = HTMLParser()

Source Code for Package lxml.html