1 """CSS Selectors based on XPath.
2
3 This module supports selecting XML/HTML tags based on CSS selectors.
4 See the `CSSSelector` class for details.
5 """
6
7 import re
8 from lxml import etree
9
10 __all__ = ['SelectorSyntaxError', 'ExpressionError',
11 'CSSSelector']
12
13 try:
14 _basestring = basestring
15 except NameError:
16 _basestring = str
17
20
23
25 """A CSS selector.
26
27 Usage::
28
29 >>> from lxml import etree, cssselect
30 >>> select = cssselect.CSSSelector("a tag > child")
31
32 >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
33 >>> [ el.tag for el in select(root) ]
34 ['child']
35
36 To use CSS namespaces, you need to pass a prefix-to-namespace
37 mapping as ``namespaces`` keyword argument::
38
39 >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
40 >>> select_ns = cssselect.CSSSelector('root > rdf|Description',
41 ... namespaces={'rdf': rdfns})
42
43 >>> rdf = etree.XML((
44 ... '<root xmlns:rdf="%s">'
45 ... '<rdf:Description>blah</rdf:Description>'
46 ... '</root>') % rdfns)
47 >>> [(el.tag, el.text) for el in select_ns(rdf)]
48 [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]
49 """
50 - def __init__(self, css, namespaces=None):
54
56 return '<%s %s for %r>' % (
57 self.__class__.__name__,
58 hex(abs(id(self)))[2:],
59 self.css)
60
61
62
63
64 try:
65 _unicode = unicode
66 _unichr = unichr
67 except NameError:
68
69 _unicode = str
70 _unichr = chr
71
74 obj = _unicode.__new__(cls, contents)
75 obj.pos = pos
76 return obj
77
79 return '%s(%s, %r)' % (
80 self.__class__.__name__,
81 _unicode.__repr__(self),
82 self.pos)
83
86
89
92
93
94
95
96
97
98
99
# NOTE(review): the listing this file was captured from dropped the
# ``class`` statement and the ``def`` lines of the last two methods and
# flattened all indentation.  The class name comes from the
# ``Class(result, ...)`` call in the parser and ``xpath`` from the
# ``.xpath()`` calls made on selector nodes; ``__repr__`` and the
# ``object`` base are assumed -- confirm against the pristine source.
class Class(object):
    """
    Represents selector.class_name
    """

    def __init__(self, selector, class_name):
        # ``selector`` is the node being refined; it must provide xpath().
        self.selector = selector
        self.class_name = class_name

    def __repr__(self):
        return '%s[%r.%s]' % (
            self.__class__.__name__,
            self.selector,
            self.class_name)

    def xpath(self):
        """Return the wrapped selector's XPath with a class test added."""
        sel_xpath = self.selector.xpath()
        # Both the normalised @class value and the wanted name are padded
        # with spaces, so only whole space-separated tokens can match.
        sel_xpath.add_condition(
            "contains(concat(' ', normalize-space(@class), ' '), %s)"
            % xpath_literal(' ' + self.class_name + ' '))
        return sel_xpath
120
122 """
123 Represents selector:name(expr)
124 """
125
126 unsupported = [
127 'target', 'lang', 'enabled', 'disabled',]
128
def __init__(self, selector, type, name, expr):
    """Store the parts of a ``selector:name(expr)`` node.

    ``type`` is the pseudo prefix token consumed by the parser (':' or
    '::'), ``name`` the function name and ``expr`` its parsed argument.
    The values are stored unchanged.
    """
    self.selector = selector
    self.type = type
    self.name = name
    self.expr = expr
134
136 return '%s[%r%s%s(%r)]' % (
137 self.__class__.__name__,
138 self.selector,
139 self.type, self.name, self.expr)
140
152
155 a, b = parse_series(expr)
156 if not a and not b and not last:
157
158 xpath.add_condition('false() and position() = 0')
159 return xpath
160 if add_name_test:
161 xpath.add_name_test()
162 xpath.add_star_prefix()
163 if a == 0:
164 if last:
165 b = 'last() - %s' % b
166 xpath.add_condition('position() = %s' % b)
167 return xpath
168 if last:
169
170 a = -a
171 b = -b
172 if b > 0:
173 b_neg = str(-b)
174 else:
175 b_neg = '+%s' % (-b)
176 if a != 1:
177 expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
178 else:
179 expr = []
180 if b >= 0:
181 expr.append('position() >= %s' % b)
182 elif b < 0 and last:
183 expr.append('position() < (last() %s)' % b)
184 expr = ' and '.join(expr)
185 if expr:
186 xpath.add_condition(expr)
187 return xpath
188
189
190
191
192
193
194
195
198
204
207
216
224
227
228 ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
229 ns.prefix = 'css'
230 ns['lower-case'] = _make_lower_case
231
233 """
234 Represents selector:ident
235 """
236
237 unsupported = ['indeterminate', 'first-line', 'first-letter',
238 'selection', 'before', 'after', 'link', 'visited',
239 'active', 'focus', 'hover']
240
def __init__(self, element, type, ident):
    """Store the parts of a ``selector:ident`` node.

    ``type`` must be ':' or '::'; anything else trips the assertion
    below.  ``element`` and ``ident`` are stored unchanged.
    """
    self.element = element
    # Internal invariant: the parser only ever hands over these two tokens.
    assert type in (':', '::')
    self.type = type
    self.ident = ident
246
248 return '%s[%r%s%s]' % (
249 self.__class__.__name__,
250 self.element,
251 self.type, self.ident)
252
265
267
268 xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
269 return xpath
270
272
273 raise NotImplementedError
274
280
286
294
302
308
310 if xpath.element == '*':
311 raise NotImplementedError(
312 "*:only-of-type is not implemented")
313 xpath.add_condition('last() = 1')
314 return xpath
315
319
321 """
322 Represents selector[namespace|attrib operator value]
323 """
324
325 - def __init__(self, selector, namespace, attrib, operator, value):
331
333 if self.operator == 'exists':
334 return '%s[%r[%s]]' % (
335 self.__class__.__name__,
336 self.selector,
337 self._format_attrib())
338 else:
339 return '%s[%r[%s %s %r]]' % (
340 self.__class__.__name__,
341 self.selector,
342 self._format_attrib(),
343 self.operator,
344 self.value)
345
351
358
# NOTE(review): the ``def`` line was missing from the captured listing;
# the name ``xpath`` is taken from the ``.xpath()`` calls made on selector
# nodes elsewhere in this file -- confirm against the pristine source.
def xpath(self):
    """Return the wrapped selector's XPath with this attribute test added.

    Reads ``self.selector``, ``self.operator`` and ``self.value`` and
    obtains the XPath spelling of the attribute from
    ``self._xpath_attrib()``.

    Raises AssertionError for an operator that is not handled here.
    """
    path = self.selector.xpath()
    attrib = self._xpath_attrib()
    value = self.value
    operator = self.operator
    if operator == 'exists':
        assert not value
        path.add_condition(attrib)
    elif operator == '=':
        path.add_condition('%s = %s' % (attrib, xpath_literal(value)))
    elif operator == '!=':
        if value:
            # A missing attribute also counts as "not equal".
            path.add_condition('not(%s) or %s != %s'
                               % (attrib, attrib, xpath_literal(value)))
        else:
            path.add_condition('%s != %s'
                               % (attrib, xpath_literal(value)))
    elif operator == '~=':
        # Whole-word match inside a space-separated list.
        path.add_condition(
            "contains(concat(' ', normalize-space(%s), ' '), %s)"
            % (attrib, xpath_literal(' ' + value + ' ')))
    elif operator == '|=':
        # Exactly the value, or the value followed by a hyphen.
        path.add_condition('%s = %s or starts-with(%s, %s)' % (
            attrib, xpath_literal(value),
            attrib, xpath_literal(value + '-')))
    elif operator == '^=':
        path.add_condition('starts-with(%s, %s)' % (
            attrib, xpath_literal(value)))
    elif operator == '$=':
        # Compare the tail of the attribute (the part starting
        # len(value)-1 characters before its end) with the value.
        path.add_condition('substring(%s, string-length(%s)-%s) = %s'
                           % (attrib, attrib, len(value) - 1,
                              xpath_literal(value)))
    elif operator == '*=':
        path.add_condition('contains(%s, %s)' % (
            attrib, xpath_literal(value)))
    else:
        # Raised explicitly: ``assert 0`` vanishes under ``python -O`` and
        # would silently return the path without any attribute filter.
        raise AssertionError("Unknown operator: %r" % self.operator)
    return path
399
401 """
402 Represents namespace|element
403 """
404
405 - def __init__(self, namespace, element):
408
410 return '%s[%s]' % (
411 self.__class__.__name__,
412 self._format_element())
413
419
# NOTE(review): the ``def`` line was missing from the captured listing;
# the name ``xpath`` is taken from the ``.xpath()`` calls made on selector
# nodes elsewhere in this file -- confirm against the pristine source.
def xpath(self):
    """Return an ``XPathExpr`` whose element test is this node's name."""
    if self.namespace == '*':
        # No namespace given: the element name is lower-cased.
        el = self.element.lower()
    else:
        # Namespaced: emit ``namespace:element`` with the name as written.
        el = '%s:%s' % (self.namespace, self.element)
    return XPathExpr(element=el)
427
429 """
430 Represents selector#id
431 """
432
434 self.selector = selector
435 self.id = id
436
438 return '%s[%r#%s]' % (
439 self.__class__.__name__,
440 self.selector, self.id)
441
446
448
452 return '%s(%r)' % (
453 self.__class__.__name__,
454 self.items)
455
459
461
462 _method_mapping = {
463 ' ': 'descendant',
464 '>': 'child',
465 '+': 'direct_adjacent',
466 '~': 'indirect_adjacent',
467 }
468
def __init__(self, selector, combinator, subselector):
    """Store the left selector, the combinator token and the right selector.

    ``selector`` must not be None (asserted).  The parser passes ' ',
    '>', '+' or '~' as ``combinator``.
    """
    assert selector is not None
    self.selector = selector
    self.combinator = combinator
    self.subselector = subselector
474
476 if self.combinator == ' ':
477 comb = '<followed>'
478 else:
479 comb = self.combinator
480 return '%s[%r %s %r]' % (
481 self.__class__.__name__,
482 self.selector,
483 comb,
484 self.subselector)
485
494
499
504
511
516
517
518
519
520 _el_re = re.compile(r'^\w+\s*$', re.UNICODE)
521 _id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE)
522 _class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE)
523
525 if isinstance(css_expr, _basestring):
526 match = _el_re.search(css_expr)
527 if match is not None:
528 return '%s%s' % (prefix, match.group(0).strip())
529 match = _id_re.search(css_expr)
530 if match is not None:
531 return "%s%s[@id = '%s']" % (
532 prefix, match.group(1) or '*', match.group(2))
533 match = _class_re.search(css_expr)
534 if match is not None:
535 return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
536 prefix, match.group(1) or '*', match.group(2))
537 css_expr = parse(css_expr)
538 expr = css_expr.xpath()
539 assert expr is not None, (
540 "Got None for xpath expression from %s" % repr(css_expr))
541 if prefix:
542 expr.add_prefix(prefix)
543 return _unicode(expr)
544
546
def __init__(self, prefix=None, path=None, element='*', condition=None,
             star_prefix=False):
    """Store the pieces from which the XPath string is assembled.

    ``prefix`` and ``path`` are emitted before ``element``;
    ``condition`` (if any) is emitted as a trailing ``[...]`` predicate.
    ``star_prefix`` records that a ``*/`` step was added to ``path``.
    """
    self.prefix = prefix
    self.path = path
    self.element = element
    self.condition = condition
    self.star_prefix = star_prefix
554
556 path = ''
557 if self.prefix is not None:
558 path += _unicode(self.prefix)
559 if self.path is not None:
560 path += _unicode(self.path)
561 path += _unicode(self.element)
562 if self.condition:
563 path += '[%s]' % self.condition
564 return path
565
567 return '%s[%s]' % (
568 self.__class__.__name__, self)
569
# NOTE(review): the ``def`` line was missing from the captured listing;
# name and parameter are taken from the ``add_condition(...)`` call sites
# and from the body itself.
def add_condition(self, condition):
    """AND ``condition`` onto the predicate collected so far.

    The first condition is stored as given; each later one is appended
    as ``<existing> and (<condition>)``.
    """
    if self.condition:
        condition = '%s and (%s)' % (self.condition, condition)
    self.condition = condition
575
577 if self.path is None:
578 self.path = self.element
579 else:
580 self.path += self.element
581 self.element = part
582
588
# NOTE(review): the ``def`` line was missing from the captured listing;
# the name is taken from the ``xpath.add_name_test()`` call site.
def add_name_test(self):
    """Turn the element test into a ``name() = ...`` condition.

    Does nothing when the element is already the ``*`` wildcard;
    otherwise the element name becomes a condition and the element is
    reset to ``*``.
    """
    if self.element == '*':
        # Nothing to test for.
        return
    self.add_condition("name() = %s" % xpath_literal(self.element))
    self.element = '*'
595
# NOTE(review): the ``def`` line was missing from the captured listing;
# the name is taken from the ``xpath.add_star_prefix()`` call site.
def add_star_prefix(self):
    """
    Appends a ``*/`` step to the path (or starts the path with it) and
    flags the expression as star-prefixed.  This is for when you need
    to keep the context constrained to a single parent.
    """
    if self.path:
        self.path += '*/'
    else:
        self.path = '*/'
    self.star_prefix = True
606
def join(self, combiner, other):
    """Chain ``other`` onto this expression in place.

    The current expression, rendered as text and followed by
    ``combiner``, becomes the new prefix; path, element and condition
    are taken over from ``other``.
    """
    prefix = _unicode(self) + combiner
    path = (other.prefix or '') + (other.path or '')
    # A path that consists only of the '*/' step added by
    # add_star_prefix() is dropped when joining.
    if other.star_prefix and path == '*/':
        path = ''
    self.prefix = prefix
    self.path = path
    self.element = other.element
    self.condition = other.condition
619
621 """
622 Represents |'d expressions. Note that unfortunately it isn't
623 the union, it's the sum, so duplicate elements will appear.
624 """
625
626 - def __init__(self, items, prefix=None):
631
635
# Splits a string into runs of single quotes and the text between them;
# the capturing group keeps the quote runs in the result.
split_at_single_quotes = re.compile("('+)").split


# NOTE(review): the ``def`` line was missing from the captured listing;
# name and parameter are taken from the ``xpath_literal(...)`` call sites
# and from the body itself.
def xpath_literal(s):
    """Return ``s`` quoted as an XPath string literal.

    ``Element`` nodes are rendered through ``_format_element()``; any
    other value is converted with ``_unicode``.  Single quotes are
    preferred, double quotes are used when the text contains a single
    quote, and text containing both kinds is assembled with ``concat()``
    from differently quoted pieces.
    """
    if isinstance(s, Element):
        s = s._format_element()
    else:
        s = _unicode(s)
    if "'" not in s:
        return "'%s'" % s
    if '"' not in s:
        return '"%s"' % s
    # Runs of single quotes go inside double quotes, everything else
    # inside single quotes.
    pieces = [
        ('"%s"' if "'" in part else "'%s'") % part
        for part in split_at_single_quotes(s) if part
    ]
    return "concat(%s)" % ','.join(pieces)
654
655
656
657
673
675 result = []
676 while 1:
677 result.append(parse_selector(stream))
678 if stream.peek() == ',':
679 stream.next()
680 else:
681 break
682 if len(result) == 1:
683 return result[0]
684 else:
685 return Or(result)
686
# NOTE(review): the ``def`` line was missing from the captured listing;
# name and parameter are taken from the ``parse_selector(stream)`` call.
def parse_selector(stream):
    """Parse one selector: simple selectors chained by combinators.

    Stops at a ',' token or at the end of the stream and returns the
    node built so far.  Two simple selectors with no '+', '>' or '~'
    token between them are combined with the ' ' combinator.

    Raises SelectorSyntaxError when no tokens are consumed where a
    simple selector is expected.
    """
    result = parse_simple_selector(stream)
    while 1:
        peek = stream.peek()
        if peek == ',' or peek is None:
            return result
        elif peek in ('+', '>', '~'):
            combinator = stream.next()
            # Whitespace after an explicit combinator carries no meaning.
            while stream.peek() == ' ':
                stream.next()
        else:
            combinator = ' '
        # Remember how much was consumed so a stalled parse is detected
        # instead of looping forever.
        consumed = len(stream.used)
        next_selector = parse_simple_selector(stream)
        if consumed == len(stream.used):
            raise SelectorSyntaxError(
                "Expected selector, got '%s'" % stream.peek())
        result = CombinedSelector(result, combinator, next_selector)
708
# NOTE(review): the ``def`` line was missing from the captured listing;
# name and parameter are taken from the ``parse_simple_selector(stream)``
# call sites.
def parse_simple_selector(stream):
    """Parse an element test followed by any number of qualifiers.

    Handles an optional ``namespace|element`` (either part may be '*'),
    then ``#id``, ``.class``, ``[attrib]``, ``:pseudo`` / ``::pseudo``
    and ``:function(arg)`` qualifiers.  Returns the resulting node.

    Raises SelectorSyntaxError on a malformed selector.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        # No element name given: match any element in any namespace.
        element = namespace = '*'
    else:
        token = stream.next()
        if token != '*' and not isinstance(token, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got '%s'" % token)
        if stream.peek() == '|':
            namespace = token
            stream.next()
            element = stream.next()
            # Validate the element part itself.  The check used to look
            # at the namespace token again, which rejected ``*|name``
            # and accepted non-symbols after the '|'.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % element)
        else:
            namespace = '*'
            element = token
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # Only one #id is consumed per simple selector.
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            closing = stream.next()
            if not closing == ']':
                raise SelectorSyntaxError(
                    "] expected, got '%s'" % closing)
            continue
        elif peek == ':' or peek == '::':
            pseudo_type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # Anything else is parsed as a nested simple selector.
                    selector = parse_simple_selector(stream)
                closing = stream.next()
                if not closing == ')':
                    raise SelectorSyntaxError(
                        "Expected ')', got '%s' and '%s'"
                        % (closing, selector))
                result = Function(result, pseudo_type, ident, selector)
            else:
                result = Pseudo(result, pseudo_type, ident)
            continue
        else:
            if peek == ' ':
                stream.next()
            break
    return result
784
# NOTE(review): the ``def`` line was missing from the captured listing;
# name and parameter are taken from the ``is_int(peek)`` call site and
# from the body itself.
def is_int(v):
    """Return True if ``int(v)`` succeeds, False if it raises ValueError."""
    try:
        int(v)
    except ValueError:
        return False
    return True
792
# NOTE(review): the ``def`` line was missing from the captured listing;
# name and parameters are taken from the ``parse_attrib(result, stream)``
# call site and from the body itself.
def parse_attrib(selector, stream):
    """Parse the inside of ``[...]`` and return an ``Attrib`` node.

    The opening '[' has already been consumed by the caller and the
    closing ']' is left in the stream.  A bare ``[name]`` yields the
    'exists' operator with a value of None.

    Raises SelectorSyntaxError for an unknown operator or for a value
    that is neither a symbol nor a string.
    """
    attrib = stream.next()
    if stream.peek() == '|':
        namespace = attrib
        stream.next()
        attrib = stream.next()
    else:
        namespace = '*'
    if stream.peek() == ']':
        return Attrib(selector, namespace, attrib, 'exists', None)
    op = stream.next()
    if op not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got '%s'" % op)
    value = stream.next()
    if not isinstance(value, (Symbol, String)):
        raise SelectorSyntaxError(
            "Expected string or symbol, got '%s'" % value)
    return Attrib(selector, namespace, attrib, op, value)
812
# NOTE(review): the ``def`` line was missing from the captured listing;
# name and parameter are taken from the ``parse_series(expr)`` call site
# and from the body itself.
def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)

    An ``Element`` is first rendered with ``_format_element()``.  Empty
    input and '*' give (0, 0), an int ``b`` gives (0, b), and 'odd',
    'even' and 'n' give (2, 1), (2, 0) and (1, 0).  A missing or
    sign-only ``a`` counts as 1 / -1 / +1; a missing ``b`` counts as 0.
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        # Happens when there's nothing, which the CSS parser thinks of as *
        return (0, 0)
    if isinstance(s, int):
        # Happens when you just get a number
        return (0, s)
    if s == 'odd':
        return (2, 1)
    if s == 'even':
        return (2, 0)
    if s == 'n':
        return (1, 0)
    if 'n' not in s:
        # Just a b
        return (0, int(s))
    a, b = s.split('n', 1)
    if not a:
        a = 1
    elif a in ('-', '+'):
        # A bare sign stands for -1 / +1.
        a = int(a + '1')
    else:
        a = int(a)
    if not b:
        b = 0
    elif b in ('-', '+'):
        b = int(b + '1')
    else:
        b = int(b)
    return (a, b)
848
849
850
851
852
853
854 _match_whitespace = re.compile(r'\s+', re.UNICODE).match
855
856 _replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub
857
858 _match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match
859
861 pos = 0
862 s = _replace_comments('', s)
863 while 1:
864 match = _match_whitespace(s, pos=pos)
865 if match:
866 preceding_whitespace_pos = pos
867 pos = match.end()
868 else:
869 preceding_whitespace_pos = 0
870 if pos >= len(s):
871 return
872 match = _match_count_number(s, pos=pos)
873 if match and match.group() != 'n':
874 sym = s[pos:match.end()]
875 yield Symbol(sym, pos)
876 pos = match.end()
877 continue
878 c = s[pos]
879 c2 = s[pos:pos+2]
880 if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
881 if c2 == '::' and preceding_whitespace_pos > 0:
882 yield Token(' ', preceding_whitespace_pos)
883 yield Token(c2, pos)
884 pos += 2
885 continue
886 if c in '>+~,.*=[]()|:#':
887 if c in ':.#[' and preceding_whitespace_pos > 0:
888 yield Token(' ', preceding_whitespace_pos)
889 yield Token(c, pos)
890 pos += 1
891 continue
892 if c == '"' or c == "'":
893
894 old_pos = pos
895 sym, pos = tokenize_escaped_string(s, pos)
896 yield String(sym, old_pos)
897 continue
898 old_pos = pos
899 sym, pos = tokenize_symbol(s, pos)
900 yield Symbol(sym, old_pos)
901 continue
902
# Splits a string literal at backslash escapes: a backslash followed by
# either 1-6 hex digits plus one optional whitespace terminator, or by any
# single non-hex character.  The capturing group keeps the escapes in the
# result.  Written as one raw pattern so that ``\s`` reaches the regex
# engine instead of being an invalid string escape.
split_at_string_escapes = re.compile(
    r'(\\(?:[A-Fa-f0-9]{1,6}(?:\r\n|\s)?|[^A-Fa-f0-9]))').split


# NOTE(review): the ``def`` line was missing from the captured listing;
# name and parameter are taken from the ``unescape_string_literal(result)``
# call site and from the body itself.
def unescape_string_literal(literal):
    """Return ``literal`` with its backslash escapes resolved.

    A hex escape becomes the character with that code point; any other
    escaped character stands for itself.

    Raises SelectorSyntaxError for a backslash that does not start a
    recognised escape.
    """
    substrings = []
    for substring in split_at_string_escapes(literal):
        if not substring:
            continue
        elif '\\' in substring:
            if substring[0] == '\\' and len(substring) > 1:
                substring = substring[1:]
                if substring[0] in '0123456789ABCDEFabcdef':
                    # int() ignores the optional trailing whitespace.
                    substring = _unichr(int(substring, 16))
            else:
                raise SelectorSyntaxError(
                    "Invalid escape sequence %r in string %r"
                    % (substring.split('\\')[1], literal))
        substrings.append(substring)
    return ''.join(substrings)
924
# NOTE(review): the ``def`` line was missing from the captured listing;
# name and parameters are taken from the
# ``tokenize_escaped_string(s, pos)`` call site and from the body itself.
def tokenize_escaped_string(s, pos):
    """Read the quoted string that starts at ``s[pos]``.

    Returns ``(text, end)`` where ``text`` is the string content with
    escapes resolved and ``end`` is the index just past the closing
    quote.

    Raises SelectorSyntaxError when the closing quote is missing.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    pos += 1
    start = pos
    while 1:
        end = s.find(quote, pos)
        if end == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        result = s[start:end]
        # The quote is escaped only when an odd number of backslashes
        # precedes it; an even run is a series of escaped backslashes.
        # (A plain endswith('\\') test mistook ``\\`` before the closing
        # quote for an escaped quote.)
        backslashes = len(result) - len(result.rstrip('\\'))
        if backslashes % 2:
            pos = end + 1
            continue
        if '\\' in result:
            result = unescape_string_literal(result)
        return result, end + 1
944
# Any character that cannot be part of a symbol: everything except word
# characters, the backslash and the hyphen.
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)


# NOTE(review): the ``def`` line was missing from the captured listing;
# name and parameters are taken from the ``tokenize_symbol(s, pos)`` call
# site and from the body itself.
def tokenize_symbol(s, pos):
    """Read the symbol that starts at ``s[pos]``.

    Returns ``(symbol, end)`` where ``end`` is the index of the first
    character after the symbol.  A symbol running to the end of ``s``
    is returned as written; otherwise backslash escapes in it are
    decoded.

    Raises AssertionError when ``s[pos]`` cannot start a symbol and
    SelectorSyntaxError when the escapes cannot be decoded.
    """
    start = pos
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # Goes to end of s
        return s[start:], len(s)
    if match.start() == pos:
        # Raised explicitly: a bare ``assert`` vanishes under
        # ``python -O`` and an empty symbol would then be returned
        # without advancing ``pos``.
        raise AssertionError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[start:match.start()]
    pos = match.start()
    try:
        result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        # sys.exc_info() instead of ``except ... as`` keeps the spelling
        # the rest of this Python 2/3 file uses.
        import sys
        e = sys.exc_info()[1]
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, pos
970
972
def __init__(self, tokens, source=None):
    """Wrap the iterable ``tokens`` with one token of look-ahead.

    ``used`` collects every token handed out so far, ``peeked`` and
    ``_peeking`` hold the look-ahead state, and ``source`` is stored
    unchanged.
    """
    self.used = []
    self.tokens = iter(tokens)
    self.source = source
    self.peeked = None
    self._peeking = False
    try:
        # Python 2 iterators expose ``next``.
        self.next_token = self.tokens.next
    except AttributeError:
        # Python 3
        self.next_token = self.tokens.__next__
984
# NOTE(review): the ``def`` line was missing from the captured listing;
# the name is taken from the ``stream.next()`` call sites.
def next(self):
    """Return the next token and record it in ``self.used``.

    A token fetched earlier by ``peek()`` is handed out first.  Returns
    None, without recording anything, once the tokens are exhausted.
    """
    if self._peeking:
        self._peeking = False
        token = self.peeked
    else:
        try:
            token = self.next_token()
        except StopIteration:
            return None
    self.used.append(token)
    return token
997
1000
# NOTE(review): the ``def`` line was missing from the captured listing;
# the name is taken from the ``stream.peek()`` call sites.
def peek(self):
    """Return the next token without consuming it.

    The token is fetched once and kept in ``self.peeked`` until
    ``next()`` hands it out.  Returns None once the tokens are
    exhausted.
    """
    if not self._peeking:
        try:
            self.peeked = self.next_token()
        except StopIteration:
            return None
        self._peeking = True
    return self.peeked
1009