1 """CSS Selectors based on XPath.
2
3 This module supports selecting XML/HTML tags based on CSS selectors.
4 See the `CSSSelector` class for details.
5 """
6
7 import re
8 from lxml import etree
9
10 __all__ = ['SelectorSyntaxError', 'ExpressionError',
11 'CSSSelector']
12
15
18
20 """A CSS selector.
21
22 Usage::
23
24 >>> from lxml import etree, cssselect
25 >>> select = cssselect.CSSSelector("a tag > child")
26
27 >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
28 >>> [ el.tag for el in select(root) ]
29 ['child']
30 """
35
37 return '<%s %s for %r>' % (
38 self.__class__.__name__,
39 hex(abs(id(self)))[2:],
40 self.css)
41
42
43
44
47 obj = unicode.__new__(cls, contents)
48 obj.pos = pos
49 return obj
50
52 return '%s(%s, %r)' % (
53 self.__class__.__name__,
54 unicode.__repr__(self),
55 self.pos)
56
59
62
65
66
67
68
69
70
71
72
74 """
75 Represents selector.class_name
76 """
77
def __init__(self, selector, class_name):
    """Store the selector being refined and the class name it must carry."""
    self.selector, self.class_name = selector, class_name
81
83 return '%s[%r.%s]' % (
84 self.__class__.__name__,
85 self.selector,
86 self.class_name)
87
89 sel_xpath = self.selector.xpath()
90 sel_xpath.add_condition(
91 "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_repr(' '+self.class_name+' '))
92 return sel_xpath
93
95 """
96 Represents selector:name(expr)
97 """
98
99 unsupported = [
100 'target', 'lang', 'enabled', 'disabled',]
101
def __init__(self, selector, type, name, expr):
    """Record the parts of a functional pseudo-class ``selector:name(expr)``.

    *type* is kept alongside *name* and *expr* and is stored unchanged.
    """
    self.selector, self.type = selector, type
    self.name, self.expr = name, expr
107
109 return '%s[%r%s%s(%r)]' % (
110 self.__class__.__name__,
111 self.selector,
112 self.type, self.name, self.expr)
113
115 sel_path = self.selector.xpath()
116 if self.name in self.unsupported:
117 raise ExpressionError(
118 "The psuedo-class %r is not supported" % self.name)
119 method = '_xpath_' + self.name.replace('-', '_')
120 if not hasattr(self, method):
121 raise ExpressionError(
122 "The psuedo-class %r is unknown" % self.name)
123 method = getattr(self, method)
124 return method(sel_path, self.expr)
125
128 a, b = parse_series(expr)
129 if not a and not b and not last:
130
131 xpath.add_condition('false() and position() = 0')
132 return xpath
133 if add_name_test:
134 xpath.add_name_test()
135 xpath.add_star_prefix()
136 if a == 0:
137 if last:
138 b = 'last() - %s' % b
139 xpath.add_condition('position() = %s' % b)
140 return xpath
141 if last:
142
143 a = -a
144 b = -b
145 if b > 0:
146 b_neg = str(-b)
147 else:
148 b_neg = '+%s' % (-b)
149 if a != 1:
150 expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
151 else:
152 expr = []
153 if b >= 0:
154 expr.append('position() >= %s' % b)
155 elif b < 0 and last:
156 expr.append('position() < (last() %s)' % b)
157 expr = ' and '.join(expr)
158 if expr:
159 xpath.add_condition(expr)
160 return xpath
161
162
163
164
165
166
167
168
170 return self._xpath_nth_child(xpath, expr, last=True)
171
173 if xpath.element == '*':
174 raise NotImplementedError(
175 "*:nth-of-type() is not implemented")
176 return self._xpath_nth_child(xpath, expr, add_name_test=False)
177
179 return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)
180
182
183 if isinstance(expr, Element):
184 expr = expr._format_element()
185 xpath.add_condition('contains(css:lower-case(string(.)), %s)'
186 % xpath_repr(expr.lower()))
187
188 return xpath
189
191
192 expr = expr.xpath()
193 cond = expr.condition
194
195 xpath.add_condition('not(%s)' % cond)
196 return xpath
197
200
201 ns = etree.FunctionNamespace('/css/')
202 ns.prefix = 'css'
203 ns['lower-case'] = _make_lower_case
204
206 """
207 Represents selector:ident
208 """
209
210 unsupported = ['indeterminate', 'first-line', 'first-letter',
211 'selection', 'before', 'after', 'link', 'visited',
212 'active', 'focus', 'hover']
213
def __init__(self, element, type, ident):
    """Record the parts of a pseudo-class or pseudo-element ``selector:ident``.

    *type* must be the separator that introduced it: ``':'`` or ``'::'``.
    """
    assert type in (':', '::')
    self.element, self.type, self.ident = element, type, ident
219
221 return '%s[%r%s%s]' % (
222 self.__class__.__name__,
223 self.element,
224 self.type, self.ident)
225
227 el_xpath = self.element.xpath()
228 if self.ident in self.unsupported:
229 raise ExpressionError(
230 "The psuedo-class %r is unsupported" % self.ident)
231 method = '_xpath_' + self.ident.replace('-', '_')
232 if not hasattr(self, method):
233 raise ExpressionError(
234 "The psuedo-class %r is unknown" % self.ident)
235 method = getattr(self, method)
236 el_xpath = method(el_xpath)
237 return el_xpath
238
240
241 xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
242 return xpath
243
245
246 raise NotImplementedError
247
249 xpath.add_star_prefix()
250 xpath.add_name_test()
251 xpath.add_condition('position() = 1')
252 return xpath
253
255 xpath.add_star_prefix()
256 xpath.add_name_test()
257 xpath.add_condition('position() = last()')
258 return xpath
259
261 if xpath.element == '*':
262 raise NotImplementedError(
263 "*:first-of-type is not implemented")
264 xpath.add_star_prefix()
265 xpath.add_condition('position() = 1')
266 return xpath
267
269 if xpath.element == '*':
270 raise NotImplementedError(
271 "*:last-of-type is not implemented")
272 xpath.add_star_prefix()
273 xpath.add_condition('position() = last()')
274 return xpath
275
277 xpath.add_name_test()
278 xpath.add_star_prefix()
279 xpath.add_condition('last() = 1')
280 return xpath
281
283 if xpath.element == '*':
284 raise NotImplementedError(
285 "*:only-of-type is not implemented")
286 xpath.add_condition('last() = 1')
287 return xpath
288
290 xpath.add_condition("not(*) and not(normalize-space())")
291 return xpath
292
294 """
295 Represents selector[namespace|attrib operator value]
296 """
297
def __init__(self, selector, namespace, attrib, operator, value):
    """Capture an attribute test ``selector[namespace|attrib operator value]``."""
    self.selector = selector
    self.namespace, self.attrib = namespace, attrib
    self.operator, self.value = operator, value
304
306 if self.operator == 'exists':
307 return '%s[%r[%s]]' % (
308 self.__class__.__name__,
309 self.selector,
310 self._format_attrib())
311 else:
312 return '%s[%r[%s %s %r]]' % (
313 self.__class__.__name__,
314 self.selector,
315 self._format_attrib(),
316 self.operator,
317 self.value)
318
324
326
327 if self.namespace == '*':
328 return '@' + self.attrib
329 else:
330 return '@%s:%s' % (self.namespace, self.attrib)
331
333 path = self.selector.xpath()
334 attrib = self._xpath_attrib()
335 value = self.value
336 if self.operator == 'exists':
337 assert not value
338 path.add_condition(attrib)
339 elif self.operator == '=':
340 path.add_condition('%s = %s' % (attrib,
341 xpath_repr(value)))
342 elif self.operator == '!=':
343
344 if value:
345 path.add_condition('not(%s) or %s != %s'
346 % (attrib, attrib, xpath_repr(value)))
347 else:
348 path.add_condition('%s != %s'
349 % (attrib, xpath_repr(value)))
350
351 elif self.operator == '~=':
352 path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_repr(' '+value+' ')))
353 elif self.operator == '|=':
354
355 path.add_condition('%s = %s or starts-with(%s, %s)' % (
356 attrib, xpath_repr(value),
357 attrib, xpath_repr(value + '-')))
358 elif self.operator == '^=':
359 path.add_condition('starts-with(%s, %s)' % (
360 attrib, xpath_repr(value)))
361 elif self.operator == '$=':
362
363 path.add_condition('substring(%s, string-length(%s)-%s) = %s'
364 % (attrib, attrib, len(value)-1, xpath_repr(value)))
365 elif self.operator == '*=':
366
367 path.add_condition('contains(%s, %s)' % (
368 attrib, xpath_repr(value)))
369 else:
370 assert 0, ("Unknown operator: %r" % self.operator)
371 return path
372
374 """
375 Represents namespace|element
376 """
377
def __init__(self, namespace, element):
    """Remember both halves of a ``namespace|element`` name test."""
    self.namespace, self.element = namespace, element
381
383 return '%s[%s]' % (
384 self.__class__.__name__,
385 self._format_element())
386
392
394 if self.namespace == '*':
395 el = self.element.lower()
396 else:
397
398 el = '%s:%s' % (self.namespace, self.element)
399 return XPathExpr(element=el)
400
402 """
403 Represents selector#id
404 """
405
407 self.selector = selector
408 self.id = id
409
411 return '%s[%r#%s]' % (
412 self.__class__.__name__,
413 self.selector, self.id)
414
416 path = self.selector.xpath()
417 path.add_condition('@id = %s' % xpath_repr(self.id))
418 return path
419
421
425 return '%s(%r)' % (
426 self.__class__.__name__,
427 self.items)
428
430 paths = [item.xpath() for item in self.items]
431 return XPathExprOr(paths)
432
434
435 _method_mapping = {
436 ' ': 'descendant',
437 '>': 'child',
438 '+': 'direct_adjacent',
439 '~': 'indirect_adjacent',
440 }
441
def __init__(self, selector, combinator, subselector):
    """Pair *selector* with *subselector* through *combinator*.

    *selector* (the left-hand side) must not be None.
    """
    assert selector is not None
    self.selector, self.combinator, self.subselector = (
        selector, combinator, subselector)
447
449 if self.combinator == ' ':
450 comb = '<followed>'
451 else:
452 comb = self.combinator
453 return '%s[%r %s %r]' % (
454 self.__class__.__name__,
455 self.selector,
456 comb,
457 self.subselector)
458
460 if self.combinator not in self._method_mapping:
461 raise ExpressionError(
462 "Unknown combinator: %r" % self.combinator)
463 method = '_xpath_' + self._method_mapping[self.combinator]
464 method = getattr(self, method)
465 path = self.selector.xpath()
466 return method(path, self.subselector)
467
472
477
479
480 xpath.join('/following-sibling::', sub.xpath())
481 xpath.add_name_test()
482 xpath.add_condition('position() = 1')
483 return xpath
484
489
490
491
492
493 _el_re = re.compile(r'^\w+\s*$')
494 _id_re = re.compile(r'^(\w*)#(\w+)\s*$')
495 _class_re = re.compile(r'^(\w*)\.(\w+)\s*$')
496
498 if isinstance(css_expr, basestring):
499 match = _el_re.search(css_expr)
500 if match is not None:
501 return '%s%s' % (prefix, match.group(0).strip())
502 match = _id_re.search(css_expr)
503 if match is not None:
504 return "%s%s[@id = '%s']" % (
505 prefix, match.group(1) or '*', match.group(2))
506 match = _class_re.search(css_expr)
507 if match is not None:
508 return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
509 prefix, match.group(1) or '*', match.group(2))
510 css_expr = parse(css_expr)
511 expr = css_expr.xpath()
512 assert expr is not None, (
513 "Got None for xpath expression from %s" % repr(css_expr))
514 if prefix:
515 expr.add_prefix(prefix)
516 return str(expr)
517
519
def __init__(self, prefix=None, path=None, element='*', condition=None,
             star_prefix=False):
    """Store the individual parts of an XPath expression.

    Every argument is kept as given on the attribute of the same name;
    *element* defaults to ``'*'`` and *star_prefix* to False.
    """
    self.prefix, self.path = prefix, path
    self.element, self.condition = element, condition
    self.star_prefix = star_prefix
527
529 path = ''
530 if self.prefix is not None:
531 path += str(self.prefix)
532 if self.path is not None:
533 path += str(self.path)
534 path += str(self.element)
535 if self.condition:
536 path += '[%s]' % self.condition
537 return path
538
540 return '%s[%s]' % (
541 self.__class__.__name__, self)
542
544 if self.condition:
545 self.condition = '%s and (%s)' % (self.condition, condition)
546 else:
547 self.condition = condition
548
550 if self.path is None:
551 self.path = self.element
552 else:
553 self.path += self.element
554 self.element = part
555
557 if self.prefix:
558 self.prefix = prefix + self.prefix
559 else:
560 self.prefix = prefix
561
563 if self.element == '*':
564
565 return
566 self.add_condition("name() = %s" % xpath_repr(self.element))
567 self.element = '*'
568
570 """
571 Adds a /* prefix if there is no prefix. This is when you need
572 to keep context's constrained to a single parent.
573 """
574 if self.path:
575 self.path += '*/'
576 else:
577 self.path = '*/'
578 self.star_prefix = True
579
def join(self, combiner, other):
    """Continue this expression with *other*, separated by *combiner*.

    The current expression, rendered with ``str()`` and followed by
    *combiner*, becomes the new prefix.  Path, element and condition are
    then taken over from *other*.  The expression is modified in place and
    nothing is returned.
    """
    new_prefix = str(self) + combiner
    other_path = ''.join((other.prefix or '', other.path or ''))
    if other.star_prefix and other_path == '*/':
        # The path consists only of the star prefix added to *other*:
        # leave it out of the joined expression.
        other_path = ''
    self.prefix, self.path = new_prefix, other_path
    self.element, self.condition = other.element, other.condition
592
594 """
595 Represents |'d expressions. Note that unfortunately it isn't
596 the union, it's the sum, so duplicate elements will appear.
597 """
598
def __init__(self, items, prefix=None):
    """Keep the alternative expressions *items* and an optional *prefix*.

    None of the items may be None.
    """
    assert all(item is not None for item in items)
    self.items, self.prefix = items, prefix
604
606 prefix = self.prefix or ''
607 return ' | '.join([prefix + str(i) for i in self.items])
608
610
611
612
def xpath_repr(s):
    """Return *s* quoted as an XPath 1.0 string literal.

    An ``Element`` is first rendered with its ``_format_element()`` method.

    XPath string literals have no escape syntax, so Python's ``repr()`` is
    not a safe way to quote them: the backslash escapes it produces would
    be taken literally by XPath.  The text is therefore wrapped in
    whichever quote character it does not contain, and text containing
    both kinds of quote is assembled with ``concat()``.
    """
    if isinstance(s, Element):
        s = s._format_element()
    s = str(s)
    if "'" not in s:
        return "'%s'" % s
    if '"' not in s:
        return '"%s"' % s
    # Both quote kinds occur: single-quote the pieces between apostrophes
    # and splice the apostrophes back in as double-quoted literals.
    pieces = ["'%s'" % piece for piece in s.split("'")]
    return 'concat(%s)' % ', "\'", '.join(pieces)
617
618
619
620
def parse(string):
    """Parse the CSS selector *string* and return its parse object.

    The string is tokenized and handed to ``parse_selector_group()``.

    Raises SelectorSyntaxError on malformed input.  The error message is
    extended with the tokens consumed so far (``stream.used``) and the
    tokens still left in the stream before the exception is re-raised.
    """
    stream = TokenStream(tokenize(string))
    stream.source = string
    try:
        return parse_selector_group(stream)
    except SelectorSyntaxError as e:
        # ``list(stream)`` drains whatever was not consumed yet.
        e.args = tuple(["%s at %s -> %s" % (
            e, stream.used, list(stream))])
        raise
630
def parse_selector_group(stream):
    """Parse a comma-separated group of selectors from *stream*.

    Returns the selector itself when the group has a single member,
    otherwise an ``Or`` over all members.
    """
    selectors = [parse_selector(stream)]
    while stream.peek() == ',':
        stream.next()  # consume the comma
        selectors.append(parse_selector(stream))
    if len(selectors) > 1:
        return Or(selectors)
    return selectors[0]
643
def parse_selector(stream):
    """Parse a chain of simple selectors joined by combinators.

    Parsing stops at a comma or at the end of *stream*.  An explicit
    ``+``, ``>`` or ``~`` token is used as the combinator; anything else
    is treated as the descendant combinator ``' '``.  Returns the single
    simple selector, or nested ``CombinedSelector`` objects built from
    left to right.
    """
    left = parse_simple_selector(stream)
    upcoming = stream.peek()
    while not (upcoming == ',' or upcoming is None):
        if upcoming in ('+', '>', '~'):
            combinator = stream.next()
        else:
            combinator = ' '
        right = parse_simple_selector(stream)
        left = CombinedSelector(left, combinator, right)
        upcoming = stream.peek()
    return left
658
def parse_simple_selector(stream):
    """Parse one simple selector from *stream* and return its parse object.

    A simple selector is an optional ``namespace|element`` name (``*`` for
    both when omitted) followed by any number of ``#id``, ``.class``,
    ``[attrib]``, ``:pseudo`` and ``:function(argument)`` qualifiers.  Each
    qualifier wraps the object built so far, so the returned object is the
    last qualifier parsed (or the bare ``Element``).

    Raises SelectorSyntaxError when a required token is missing or is of
    the wrong kind.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        # No element name given: the qualifiers apply to any element.
        element = namespace = '*'
    else:
        token = stream.next()
        if token != '*' and not isinstance(token, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got %r" % token)
        if stream.peek() == '|':
            namespace = token
            stream.next()  # consume the '|'
            element = stream.next()
            # Validate (and report) the element token, not the namespace
            # token that was already checked above.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % element)
        else:
            namespace = '*'
            element = token
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # Only one #id is accepted per simple selector; leave the
                # second '#' in the stream for the caller.
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            closing = stream.next()
            if not closing == ']':
                raise SelectorSyntaxError(
                    "] expected, got %r" % closing)
            continue
        elif peek == ':' or peek == '::':
            pseudo_type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                # The argument is a quoted string, a plain integer, or
                # otherwise parsed as a nested simple selector.
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    selector = parse_simple_selector(stream)
                closing = stream.next()
                if not closing == ')':
                    raise SelectorSyntaxError(
                        "Expected ), got %r and %r"
                        % (closing, selector))
                result = Function(result, pseudo_type, ident, selector)
            else:
                result = Pseudo(result, pseudo_type, ident)
            continue
        else:
            if peek == ' ':
                # Swallow the whitespace token that ends this selector.
                stream.next()
            break

    return result
734
def is_int(v):
    """Return True if ``int(v)`` succeeds, False otherwise.

    Values that ``int()`` rejects by type (such as None) are reported as
    False as well, so the predicate itself never raises for them.
    """
    try:
        int(v)
    except (TypeError, ValueError):
        return False
    else:
        return True
742
def parse_attrib(selector, stream):
    """Parse the inside of an ``[...]`` attribute test on *selector*.

    Reads ``attrib`` or ``namespace|attrib`` (namespace ``'*'`` when
    absent).  If the closing ``]`` follows directly, an ``'exists'`` test
    with value None is returned and the ``]`` is left in the stream.
    Otherwise an operator and a value token are read.

    Raises SelectorSyntaxError for an unknown operator or for a value that
    is neither a Symbol nor a String.
    """
    name = stream.next()
    namespace = '*'
    if stream.peek() == '|':
        stream.next()  # consume the '|'
        namespace, name = name, stream.next()
    if stream.peek() == ']':
        return Attrib(selector, namespace, name, 'exists', None)
    operator = stream.next()
    if operator not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got %r" % operator)
    operand = stream.next()
    if isinstance(operand, (Symbol, String)):
        return Attrib(selector, namespace, name, operator, operand)
    raise SelectorSyntaxError(
        "Expected string or symbol, got %r" % operand)
762
def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)

    An ``Element`` is first rendered with ``_format_element()``.  Empty
    input and ``'*'`` give ``(0, 0)``; an integer or a string without
    ``'n'`` gives ``(0, value)``; ``'odd'``, ``'even'`` and ``'n'`` give
    ``(2, 1)``, ``(2, 0)`` and ``(1, 0)``.
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        return (0, 0)
    if isinstance(s, int):
        return (0, s)
    for keyword, series in (('odd', (2, 1)), ('even', (2, 0)), ('n', (1, 0))):
        if s == keyword:
            return series
    if 'n' not in s:
        return (0, int(s))

    def _term(text, default):
        # A missing term takes its default; a bare sign stands for +/-1.
        if not text:
            return default
        if text == '-' or text == '+':
            return int(text + '1')
        return int(text)

    step, offset = s.split('n', 1)
    return (_term(step, 1), _term(offset, 0))
798
799
800
801
802
803
804 _whitespace_re = re.compile(r'\s+')
805
806 _comment_re = re.compile(r'/\*.*?\*/', re.S)
807
808 _count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')
809
def tokenize(s):
    """Generate the tokens of the CSS selector string *s*.

    Comments are removed first and whitespace between tokens is skipped.
    Yields ``Symbol`` for names and for ``an+b`` style series, ``Token``
    for punctuation and operators, and ``String`` for quoted strings; each
    is created with its offset in the comment-stripped string.

    A ``Token(' ')`` is emitted ahead of a ``.`` or ``#`` that directly
    follows whitespace.  Offset 0 doubles as the "no whitespace" marker,
    so whitespace starting at offset 0 never produces that token.
    """
    s = _comment_re.sub('', s)
    length = len(s)
    pos = 0
    while True:
        gap = _whitespace_re.match(s, pos=pos)
        if gap:
            whitespace_pos, pos = pos, gap.end()
        else:
            whitespace_pos = 0
        if pos >= length:
            return
        series = _count_re.match(s, pos=pos)
        char = s[pos]
        pair = s[pos:pos + 2]
        if series and series.group() != 'n':
            # A lone 'n' is left to the ordinary symbol handling below.
            yield Symbol(s[pos:series.end()], pos)
            pos = series.end()
        elif pair in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
            yield Token(pair, pos)
            pos += 2
        elif char in '>+~,.*=[]()|:#':
            if char in '.#' and whitespace_pos > 0:
                yield Token(' ', whitespace_pos)
            yield Token(char, pos)
            pos += 1
        elif char == '"' or char == "'":
            text, end = tokenize_escaped_string(s, pos)
            yield String(text, pos)
            pos = end
        else:
            text, end = tokenize_symbol(s, pos)
            yield Symbol(text, pos)
            pos = end
850
def tokenize_escaped_string(s, pos):
    """Read the quoted string that starts at ``s[pos]``.

    ``s[pos]`` must be a single or double quote.  Returns a tuple of the
    ``unicode_escape``-decoded text between the quotes and the index just
    past the closing quote.  When the text up to a candidate closing quote
    fails to decode, that quote is skipped and the next one is tried.

    Raises SelectorSyntaxError if no usable closing quote is found.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    start = search_from = pos + 1
    while True:
        closing = s.find(quote, search_from)
        if closing == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        try:
            return s[start:closing].decode('unicode_escape'), closing + 1
        except UnicodeDecodeError:
            # Not decodable up to this quote: look for a later one.
            search_from = closing + 1
870
871 _illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)
872
def tokenize_symbol(s, pos):
    """Read the symbol that starts at ``s[pos]``.

    The symbol extends up to the first character matched by
    ``_illegal_symbol``.  Returns a tuple of the symbol text and the index
    at which it ends.  A symbol that runs to the end of *s* is returned
    as-is; otherwise the text is decoded with ``unicode_escape``.

    Raises SelectorSyntaxError if ``s[pos]`` itself cannot start a symbol,
    or if the symbol text cannot be decoded.
    """
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # Goes to the end of s.
        return s[pos:], len(s)
    end = match.start()
    if end == pos:
        # Raise rather than assert: with assertions disabled an empty
        # symbol would be returned without advancing the position.
        raise SelectorSyntaxError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[pos:end]
    try:
        result = result.decode('unicode_escape')
    except UnicodeDecodeError as e:
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, end
894
896
def __init__(self, tokens, source=None):
    """Wrap the iterable *tokens* in a stream with one token of lookahead.

    *source* is stored as given.  ``used`` starts as an empty list and the
    lookahead slot starts out empty.
    """
    self.tokens = iter(tokens)
    self.source = source
    self.used = []
    self.peeked, self._peeking = None, False
903
905 if self._peeking:
906 self._peeking = False
907 self.used.append(self.peeked)
908 return self.peeked
909 else:
910 try:
911 next = self.tokens.next()
912 self.used.append(next)
913 return next
914 except StopIteration:
915 return None
916
918 return iter(self.next, None)
919
921 if not self._peeking:
922 try:
923 self.peeked = self.tokens.next()
924 except StopIteration:
925 return None
926 self._peeking = True
927 return self.peeked
928