1 """CSS Selectors based on XPath.
2
3 This module supports selecting XML/HTML tags based on CSS selectors.
4 See the `CSSSelector` class for details.
5 """
6
7 import re
8 from lxml import etree
9
10 __all__ = ['SelectorSyntaxError', 'ExpressionError',
11 'CSSSelector']
12
13 try:
14 _basestring = basestring
15 except NameError:
16 _basestring = str
17
20
23
25 """A CSS selector.
26
27 Usage::
28
29 >>> from lxml import etree, cssselect
30 >>> select = cssselect.CSSSelector("a tag > child")
31
32 >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
33 >>> [ el.tag for el in select(root) ]
34 ['child']
35 """
40
42 return '<%s %s for %r>' % (
43 self.__class__.__name__,
44 hex(abs(id(self)))[2:],
45 self.css)
46
47
48
49
50 try:
51 _unicode = unicode
52 except NameError:
53
54 _unicode = str
55
58 obj = _unicode.__new__(cls, contents)
59 obj.pos = pos
60 return obj
61
63 return '%s(%s, %r)' % (
64 self.__class__.__name__,
65 _unicode.__repr__(self),
66 self.pos)
67
70
73
76
77
78
79
80
81
82
83
85 """
86 Represents selector.class_name
87 """
88
89 - def __init__(self, selector, class_name):
90 self.selector = selector
91 self.class_name = class_name
92
94 return '%s[%r.%s]' % (
95 self.__class__.__name__,
96 self.selector,
97 self.class_name)
98
100 sel_xpath = self.selector.xpath()
101 sel_xpath.add_condition(
102 "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_repr(' '+self.class_name+' '))
103 return sel_xpath
104
106 """
107 Represents selector:name(expr)
108 """
109
110 unsupported = [
111 'target', 'lang', 'enabled', 'disabled',]
112
113 - def __init__(self, selector, type, name, expr):
114 self.selector = selector
115 self.type = type
116 self.name = name
117 self.expr = expr
118
120 return '%s[%r%s%s(%r)]' % (
121 self.__class__.__name__,
122 self.selector,
123 self.type, self.name, self.expr)
124
136
139 a, b = parse_series(expr)
140 if not a and not b and not last:
141
142 xpath.add_condition('false() and position() = 0')
143 return xpath
144 if add_name_test:
145 xpath.add_name_test()
146 xpath.add_star_prefix()
147 if a == 0:
148 if last:
149 b = 'last() - %s' % b
150 xpath.add_condition('position() = %s' % b)
151 return xpath
152 if last:
153
154 a = -a
155 b = -b
156 if b > 0:
157 b_neg = str(-b)
158 else:
159 b_neg = '+%s' % (-b)
160 if a != 1:
161 expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
162 else:
163 expr = []
164 if b >= 0:
165 expr.append('position() >= %s' % b)
166 elif b < 0 and last:
167 expr.append('position() < (last() %s)' % b)
168 expr = ' and '.join(expr)
169 if expr:
170 xpath.add_condition(expr)
171 return xpath
172
173
174
175
176
177
178
179
182
188
191
200
208
211
212 ns = etree.FunctionNamespace('/css/')
213 ns.prefix = 'css'
214 ns['lower-case'] = _make_lower_case
215
217 """
218 Represents selector:ident
219 """
220
221 unsupported = ['indeterminate', 'first-line', 'first-letter',
222 'selection', 'before', 'after', 'link', 'visited',
223 'active', 'focus', 'hover']
224
225 - def __init__(self, element, type, ident):
226 self.element = element
227 assert type in (':', '::')
228 self.type = type
229 self.ident = ident
230
232 return '%s[%r%s%s]' % (
233 self.__class__.__name__,
234 self.element,
235 self.type, self.ident)
236
249
251
252 xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
253 return xpath
254
256
257 raise NotImplementedError
258
264
270
278
286
292
294 if xpath.element == '*':
295 raise NotImplementedError(
296 "*:only-of-type is not implemented")
297 xpath.add_condition('last() = 1')
298 return xpath
299
303
305 """
306 Represents selector[namespace|attrib operator value]
307 """
308
309 - def __init__(self, selector, namespace, attrib, operator, value):
310 self.selector = selector
311 self.namespace = namespace
312 self.attrib = attrib
313 self.operator = operator
314 self.value = value
315
317 if self.operator == 'exists':
318 return '%s[%r[%s]]' % (
319 self.__class__.__name__,
320 self.selector,
321 self._format_attrib())
322 else:
323 return '%s[%r[%s %s %r]]' % (
324 self.__class__.__name__,
325 self.selector,
326 self._format_attrib(),
327 self.operator,
328 self.value)
329
335
337
338 if self.namespace == '*':
339 return '@' + self.attrib
340 else:
341 return '@%s:%s' % (self.namespace, self.attrib)
342
344 path = self.selector.xpath()
345 attrib = self._xpath_attrib()
346 value = self.value
347 if self.operator == 'exists':
348 assert not value
349 path.add_condition(attrib)
350 elif self.operator == '=':
351 path.add_condition('%s = %s' % (attrib,
352 xpath_repr(value)))
353 elif self.operator == '!=':
354
355 if value:
356 path.add_condition('not(%s) or %s != %s'
357 % (attrib, attrib, xpath_repr(value)))
358 else:
359 path.add_condition('%s != %s'
360 % (attrib, xpath_repr(value)))
361
362 elif self.operator == '~=':
363 path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_repr(' '+value+' ')))
364 elif self.operator == '|=':
365
366 path.add_condition('%s = %s or starts-with(%s, %s)' % (
367 attrib, xpath_repr(value),
368 attrib, xpath_repr(value + '-')))
369 elif self.operator == '^=':
370 path.add_condition('starts-with(%s, %s)' % (
371 attrib, xpath_repr(value)))
372 elif self.operator == '$=':
373
374 path.add_condition('substring(%s, string-length(%s)-%s) = %s'
375 % (attrib, attrib, len(value)-1, xpath_repr(value)))
376 elif self.operator == '*=':
377
378 path.add_condition('contains(%s, %s)' % (
379 attrib, xpath_repr(value)))
380 else:
381 assert 0, ("Unknown operator: %r" % self.operator)
382 return path
383
385 """
386 Represents namespace|element
387 """
388
389 - def __init__(self, namespace, element):
390 self.namespace = namespace
391 self.element = element
392
394 return '%s[%s]' % (
395 self.__class__.__name__,
396 self._format_element())
397
403
405 if self.namespace == '*':
406 el = self.element.lower()
407 else:
408
409 el = '%s:%s' % (self.namespace, self.element)
410 return XPathExpr(element=el)
411
413 """
414 Represents selector#id
415 """
416
418 self.selector = selector
419 self.id = id
420
422 return '%s[%r#%s]' % (
423 self.__class__.__name__,
424 self.selector, self.id)
425
430
432
436 return '%s(%r)' % (
437 self.__class__.__name__,
438 self.items)
439
443
445
446 _method_mapping = {
447 ' ': 'descendant',
448 '>': 'child',
449 '+': 'direct_adjacent',
450 '~': 'indirect_adjacent',
451 }
452
453 - def __init__(self, selector, combinator, subselector):
454 assert selector is not None
455 self.selector = selector
456 self.combinator = combinator
457 self.subselector = subselector
458
460 if self.combinator == ' ':
461 comb = '<followed>'
462 else:
463 comb = self.combinator
464 return '%s[%r %s %r]' % (
465 self.__class__.__name__,
466 self.selector,
467 comb,
468 self.subselector)
469
478
483
488
495
500
501
502
503
504 _el_re = re.compile(r'^\w+\s*$')
505 _id_re = re.compile(r'^(\w*)#(\w+)\s*$')
506 _class_re = re.compile(r'^(\w*)\.(\w+)\s*$')
507
509 if isinstance(css_expr, _basestring):
510 match = _el_re.search(css_expr)
511 if match is not None:
512 return '%s%s' % (prefix, match.group(0).strip())
513 match = _id_re.search(css_expr)
514 if match is not None:
515 return "%s%s[@id = '%s']" % (
516 prefix, match.group(1) or '*', match.group(2))
517 match = _class_re.search(css_expr)
518 if match is not None:
519 return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
520 prefix, match.group(1) or '*', match.group(2))
521 css_expr = parse(css_expr)
522 expr = css_expr.xpath()
523 assert expr is not None, (
524 "Got None for xpath expression from %s" % repr(css_expr))
525 if prefix:
526 expr.add_prefix(prefix)
527 return str(expr)
528
530
531 - def __init__(self, prefix=None, path=None, element='*', condition=None,
532 star_prefix=False):
533 self.prefix = prefix
534 self.path = path
535 self.element = element
536 self.condition = condition
537 self.star_prefix = star_prefix
538
549
551 return '%s[%s]' % (
552 self.__class__.__name__, self)
553
def add_condition(self, condition):
    """AND ``condition`` onto the expression's current condition.

    When no condition is set yet, ``condition`` is stored as-is; otherwise
    it is appended in parentheses after ``" and "``.
    """
    existing = self.condition
    self.condition = (
        '%s and (%s)' % (existing, condition) if existing else condition)
559
561 if self.path is None:
562 self.path = self.element
563 else:
564 self.path += self.element
565 self.element = part
566
572
def add_name_test(self):
    """Move a concrete element name into a ``name() = ...`` condition.

    A wildcard element (``*``) needs no test and is left untouched;
    otherwise the condition is added and the element becomes ``*``.
    """
    tag = self.element
    if tag != '*':
        self.add_condition("name() = %s" % xpath_repr(tag))
        self.element = '*'
579
def add_star_prefix(self):
    """Append a ``*/`` step to the path and flag the expression.

    With no path set yet the path becomes ``*/``.  ``star_prefix`` is set
    to True in both cases.  This is used when the context must stay
    constrained to a single parent.
    """
    self.path = (self.path or '') + '*/'
    self.star_prefix = True
590
591 - def join(self, combiner, other):
603
605 """
606 Represents |'d expressions. Note that unfortunately it isn't
607 the union, it's the sum, so duplicate elements will appear.
608 """
609
610 - def __init__(self, items, prefix=None):
615
619
628
629
630
631
643
645 result = []
646 while 1:
647 result.append(parse_selector(stream))
648 if stream.peek() == ',':
649 stream.next()
650 else:
651 break
652 if len(result) == 1:
653 return result[0]
654 else:
655 return Or(result)
656
671
def parse_simple_selector(stream):
    """Parse one simple selector from ``stream`` and return its selector object.

    The selector starts with an optional type part: ``*``, ``element`` or
    ``namespace|element``.  When the first token is neither ``*`` nor a
    ``Symbol``, both namespace and element default to ``*``.  Any number of
    qualifiers may follow, each wrapping the result built so far:
    ``#id`` (at most one), ``.class``, ``[attrib ...]``, ``:pseudo`` /
    ``::pseudo`` and ``:function(argument)``.  A function argument is a
    string, an integer, or a nested simple selector.

    Parsing stops at the first token that starts none of these; if that
    token is ``' '`` it is consumed.

    Raises ``SelectorSyntaxError`` when a symbol, ``]`` or ``)`` is expected
    and another token is found.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        element = namespace = '*'
    else:
        token = stream.next()
        if token != '*' and not isinstance(token, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got %r" % token)
        if stream.peek() == '|':
            namespace = token
            stream.next()
            element = stream.next()
            # The namespace token was validated above; what still needs
            # checking is the element name that follows the '|'.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % element)
        else:
            namespace = '*'
            element = token
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # A second '#' is not folded into this selector; stop here
                # and leave the token in the stream.
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            closing = stream.next()
            if not closing == ']':
                raise SelectorSyntaxError(
                    "] expected, got %r" % closing)
            continue
        elif peek == ':' or peek == '::':
            pseudo_type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # Anything else is parsed as a nested simple selector.
                    selector = parse_simple_selector(stream)
                closing = stream.next()
                if not closing == ')':
                    raise SelectorSyntaxError(
                        "Expected ), got %r and %r"
                        % (closing, selector))
                result = Function(result, pseudo_type, ident, selector)
            else:
                result = Pseudo(result, pseudo_type, ident)
            continue
        else:
            if peek == ' ':
                stream.next()
            break
    return result
747
def is_int(v):
    """Return True if ``int(v)`` succeeds, False if it raises ValueError."""
    try:
        int(v)
        return True
    except ValueError:
        return False
755
def parse_attrib(selector, stream):
    """Parse the inside of a ``[...]`` attribute selector.

    Reads ``attrib`` or ``namespace|attrib`` from ``stream``.  If the next
    token is ``]`` (left unconsumed) an ``'exists'`` test with value None is
    returned; otherwise an operator and a string-or-symbol value are read.

    Returns an ``Attrib`` wrapping ``selector``.  Raises
    ``SelectorSyntaxError`` for an unknown operator or a value that is
    neither a ``Symbol`` nor a ``String``.
    """
    first = stream.next()
    namespace = '*'
    name = first
    if stream.peek() == '|':
        stream.next()
        namespace = first
        name = stream.next()
    if stream.peek() == ']':
        return Attrib(selector, namespace, name, 'exists', None)
    operator = stream.next()
    if operator not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got %r" % operator)
    operand = stream.next()
    if isinstance(operand, (Symbol, String)):
        return Attrib(selector, namespace, name, operator, operand)
    raise SelectorSyntaxError(
        "Expected string or symbol, got %r" % operand)
775
def _series_term(text, default):
    """Convert one side of an ``an+b`` split into an int.

    An empty string gives ``default``; a lone ``+`` or ``-`` is read as
    ``+1`` / ``-1``; anything else goes through ``int()``.
    """
    if not text:
        return default
    if text in ('-', '+'):
        return int(text + '1')
    return int(text)


def parse_series(s):
    """Parse an ``an+b`` style argument such as ``'1n+2'`` into ``(a, b)``.

    An ``Element`` is first replaced by its ``_format_element()`` text.
    Empty input and ``'*'`` give ``(0, 0)``; an int ``k`` gives ``(0, k)``;
    ``'odd'``, ``'even'`` and ``'n'`` give ``(2, 1)``, ``(2, 0)`` and
    ``(1, 0)``.  Text without an ``n`` is a plain offset.
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        return (0, 0)
    if isinstance(s, int):
        return (0, s)
    for keyword, pair in (('odd', (2, 1)), ('even', (2, 0)), ('n', (1, 0))):
        if s == keyword:
            return pair
    if 'n' not in s:
        return (0, int(s))
    step, offset = s.split('n', 1)
    return (_series_term(step, 1), _series_term(offset, 0))
811
812
813
814
815
816
817 _whitespace_re = re.compile(r'\s+')
818
819 _comment_re = re.compile(r'/\*.*?\*/', re.S)
820
821 _count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')
822
824 pos = 0
825 s = _comment_re.sub('', s)
826 while 1:
827 match = _whitespace_re.match(s, pos=pos)
828 if match:
829 preceding_whitespace_pos = pos
830 pos = match.end()
831 else:
832 preceding_whitespace_pos = 0
833 if pos >= len(s):
834 return
835 match = _count_re.match(s, pos=pos)
836 if match and match.group() != 'n':
837 sym = s[pos:match.end()]
838 yield Symbol(sym, pos)
839 pos = match.end()
840 continue
841 c = s[pos]
842 c2 = s[pos:pos+2]
843 if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
844 yield Token(c2, pos)
845 pos += 2
846 continue
847 if c in '>+~,.*=[]()|:#':
848 if c in '.#' and preceding_whitespace_pos > 0:
849 yield Token(' ', preceding_whitespace_pos)
850 yield Token(c, pos)
851 pos += 1
852 continue
853 if c == '"' or c == "'":
854
855 old_pos = pos
856 sym, pos = tokenize_escaped_string(s, pos)
857 yield String(sym, old_pos)
858 continue
859 old_pos = pos
860 sym, pos = tokenize_symbol(s, pos)
861 yield Symbol(sym, old_pos)
862 continue
863
def tokenize_escaped_string(s, pos):
    """Read the quoted string that starts at ``s[pos]``.

    ``s[pos]`` must be ``"`` or ``'``.  Returns ``(text, new_pos)``: the
    contents with backslash escapes decoded, and the index just past the
    closing quote.  A candidate closing quote whose preceding text fails to
    decode is skipped and the search continues with the next one.

    Raises ``SelectorSyntaxError`` when no closing quote is found.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    start = pos + 1
    search_from = start
    while True:
        end = s.find(quote, search_from)
        if end == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        raw = s[start:end]
        try:
            decoded = raw.encode('ASCII', 'backslashreplace').decode('unicode_escape')
        except UnicodeDecodeError:
            # Decoding failed for the text up to this quote; try the next one.
            search_from = end + 1
            continue
        return decoded, end + 1
883
# Any character that cannot be part of a symbol: everything except word
# characters, backslash and hyphen.
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)


def tokenize_symbol(s, pos):
    """Read the symbol that starts at ``s[pos]``.

    Returns ``(symbol, new_pos)`` where ``new_pos`` is the index of the
    first character that cannot belong to a symbol, or ``len(s)`` when the
    symbol runs to the end of ``s``.

    Raises ``SelectorSyntaxError`` when ``s[pos]`` itself cannot start a
    symbol, or when the symbol's backslash escapes cannot be decoded.
    """
    start = pos
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # Goes to the end of s.
        # NOTE(review): this path returns the raw text without decoding
        # backslash escapes, unlike the path below -- confirm intended.
        return s[start:], len(s)
    if match.start() == pos:
        # Raised explicitly instead of asserted: an assert disappears under
        # ``python -O`` and the function would then return an empty symbol
        # without advancing ``pos``.
        raise SelectorSyntaxError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[start:match.start()]
    pos = match.start()
    try:
        result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        import sys
        e = sys.exc_info()[1]
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, pos
909
911
912 - def __init__(self, tokens, source=None):
913 self.used = []
914 self.tokens = iter(tokens)
915 self.source = source
916 self.peeked = None
917 self._peeking = False
918 try:
919 self.next_token = self.tokens.next
920 except AttributeError:
921
922 self.next_token = self.tokens.__next__
923
def next(self):
    """Consume and return the next token, or None when the stream is exhausted.

    A token previously fetched by ``peek()`` is returned first.  Every
    token handed out is also appended to ``self.used``.
    """
    if not self._peeking:
        try:
            token = self.next_token()
        except StopIteration:
            return None
        self.used.append(token)
        return token
    # Hand out the buffered look-ahead token and clear the flag.
    self._peeking = False
    token = self.peeked
    self.used.append(token)
    return token
936
939
def peek(self):
    """Return the upcoming token without consuming it.

    The token is buffered in ``self.peeked`` until ``next()`` hands it out.
    Returns None when the underlying iterator is exhausted.
    """
    if self._peeking:
        return self.peeked
    try:
        upcoming = self.next_token()
    except StopIteration:
        return None
    self.peeked = upcoming
    self._peeking = True
    return upcoming
948