1 """CSS Selectors based on XPath.
2
3 This module supports selecting XML/HTML tags based on CSS selectors.
4 See the `CSSSelector` class for details.
5 """
6
7 import re
8 from lxml import etree
9
10 __all__ = ['SelectorSyntaxError', 'ExpressionError',
11 'CSSSelector']
12
# Python 2/3 compatibility: fall back to ``str`` when the ``basestring``
# builtin is not defined.
try:
    _basestring = basestring
except NameError:
    _basestring = str
17
20
23
25 """A CSS selector.
26
27 Usage::
28
29 >>> from lxml import etree, cssselect
30 >>> select = cssselect.CSSSelector("a tag > child")
31
32 >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
33 >>> [ el.tag for el in select(root) ]
34 ['child']
35 """
40
42 return '<%s %s for %r>' % (
43 self.__class__.__name__,
44 hex(abs(id(self)))[2:],
45 self.css)
46
47
48
49
# Python 2/3 compatibility aliases for the text type and the
# code-point-to-character function.
try:
    _unicode = unicode
    _unichr = unichr
except NameError:
    # ``unicode``/``unichr`` are not defined: use ``str``/``chr`` instead.
    _unicode = str
    _unichr = chr
57
60 obj = _unicode.__new__(cls, contents)
61 obj.pos = pos
62 return obj
63
65 return '%s(%s, %r)' % (
66 self.__class__.__name__,
67 _unicode.__repr__(self),
68 self.pos)
69
72
75
78
79
80
81
82
83
84
85
87 """
88 Represents selector.class_name
89 """
90
def __init__(self, selector, class_name):
    """Store the two halves of a ``selector.class_name`` expression.

    :param selector: the selector the class test is attached to.
    :param class_name: the class name that must be present.
    """
    self.selector = selector
    self.class_name = class_name
94
96 return '%s[%r.%s]' % (
97 self.__class__.__name__,
98 self.selector,
99 self.class_name)
100
def xpath(self):
    """Return the selector's XPath with a class-membership condition added.

    The expression comes from ``self.selector.xpath()``; the same object is
    returned after ``add_condition`` has been called on it.
    """
    sel_xpath = self.selector.xpath()
    # Pad both the attribute and the class name with spaces so that only a
    # whole whitespace-separated token can match.
    padded_name = xpath_literal(' ' + self.class_name + ' ')
    sel_xpath.add_condition(
        "contains(concat(' ', normalize-space(@class), ' '), %s)" % padded_name)
    return sel_xpath
106
108 """
109 Represents selector:name(expr)
110 """
111
112 unsupported = [
113 'target', 'lang', 'enabled', 'disabled',]
114
def __init__(self, selector, type, name, expr):
    """Store the parts of a ``selector:name(expr)`` expression.

    :param selector: the selector the function is applied to.
    :param type: the separator token that preceded the name.
    :param name: the function name.
    :param expr: the parsed argument found between the parentheses.
    """
    self.selector = selector
    self.type = type
    self.name = name
    self.expr = expr
120
122 return '%s[%r%s%s(%r)]' % (
123 self.__class__.__name__,
124 self.selector,
125 self.type, self.name, self.expr)
126
138
141 a, b = parse_series(expr)
142 if not a and not b and not last:
143
144 xpath.add_condition('false() and position() = 0')
145 return xpath
146 if add_name_test:
147 xpath.add_name_test()
148 xpath.add_star_prefix()
149 if a == 0:
150 if last:
151 b = 'last() - %s' % b
152 xpath.add_condition('position() = %s' % b)
153 return xpath
154 if last:
155
156 a = -a
157 b = -b
158 if b > 0:
159 b_neg = str(-b)
160 else:
161 b_neg = '+%s' % (-b)
162 if a != 1:
163 expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
164 else:
165 expr = []
166 if b >= 0:
167 expr.append('position() >= %s' % b)
168 elif b < 0 and last:
169 expr.append('position() < (last() %s)' % b)
170 expr = ' and '.join(expr)
171 if expr:
172 xpath.add_condition(expr)
173 return xpath
174
175
176
177
178
179
180
181
184
190
193
202
210
213
# Register _make_lower_case under the name 'lower-case' in the '/css/'
# function namespace, with 'css' as that namespace's prefix.
ns = etree.FunctionNamespace('/css/')
ns.prefix = 'css'
ns['lower-case'] = _make_lower_case
217
219 """
220 Represents selector:ident
221 """
222
223 unsupported = ['indeterminate', 'first-line', 'first-letter',
224 'selection', 'before', 'after', 'link', 'visited',
225 'active', 'focus', 'hover']
226
def __init__(self, element, type, ident):
    """Store the parts of a ``selector:ident`` expression.

    :param element: the selector the pseudo-class is attached to.
    :param type: the separator, which must be ``':'`` or ``'::'``.
    :param ident: the pseudo-class or pseudo-element name.
    """
    self.element = element
    # Internal invariant rather than input validation: any other separator
    # here is treated as a programming error.
    assert type in (':', '::')
    self.type = type
    self.ident = ident
232
234 return '%s[%r%s%s]' % (
235 self.__class__.__name__,
236 self.element,
237 self.type, self.ident)
238
251
253
254 xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
255 return xpath
256
258
259 raise NotImplementedError
260
266
272
280
288
294
296 if xpath.element == '*':
297 raise NotImplementedError(
298 "*:only-of-type is not implemented")
299 xpath.add_condition('last() = 1')
300 return xpath
301
305
307 """
308 Represents selector[namespace|attrib operator value]
309 """
310
311 - def __init__(self, selector, namespace, attrib, operator, value):
317
319 if self.operator == 'exists':
320 return '%s[%r[%s]]' % (
321 self.__class__.__name__,
322 self.selector,
323 self._format_attrib())
324 else:
325 return '%s[%r[%s %s %r]]' % (
326 self.__class__.__name__,
327 self.selector,
328 self._format_attrib(),
329 self.operator,
330 self.value)
331
337
344
def xpath(self):
    """Translate the attribute test into a condition on the selector's XPath.

    The expression is obtained from ``self.selector.xpath()``, one condition
    is added for ``self.operator``, and the same object is returned.

    Raises AssertionError for an operator this method does not know.
    """
    path = self.selector.xpath()
    attrib = self._xpath_attrib()
    value = self.value
    operator = self.operator
    if operator == 'exists':
        assert not value
        path.add_condition(attrib)
    elif operator == '=':
        path.add_condition('%s = %s' % (attrib, xpath_literal(value)))
    elif operator == '!=':
        if value:
            # An element lacking the attribute also counts as "not equal".
            path.add_condition('not(%s) or %s != %s'
                               % (attrib, attrib, xpath_literal(value)))
        else:
            path.add_condition('%s != %s' % (attrib, xpath_literal(value)))
    elif operator in ('~=', '^=', '$=', '*=') and not value:
        # Selectors Level 3: with an empty value these operators represent
        # nothing.  The XPath forms below would instead match everything
        # (e.g. starts-with(x, '') is always true).
        path.add_condition('false()')
    elif operator == '~=':
        path.add_condition(
            "contains(concat(' ', normalize-space(%s), ' '), %s)"
            % (attrib, xpath_literal(' ' + value + ' ')))
    elif operator == '|=':
        # Either the exact value, or the value followed by a hyphen.
        path.add_condition('%s = %s or starts-with(%s, %s)' % (
            attrib, xpath_literal(value),
            attrib, xpath_literal(value + '-')))
    elif operator == '^=':
        path.add_condition('starts-with(%s, %s)' % (
            attrib, xpath_literal(value)))
    elif operator == '$=':
        # XPath 1.0 has no ends-with(); compare the trailing substring.
        path.add_condition('substring(%s, string-length(%s)-%s) = %s'
                           % (attrib, attrib, len(value) - 1,
                              xpath_literal(value)))
    elif operator == '*=':
        path.add_condition('contains(%s, %s)' % (
            attrib, xpath_literal(value)))
    else:
        # Raised explicitly so the check is not stripped under ``python -O``.
        raise AssertionError("Unknown operator: %r" % operator)
    return path
385
387 """
388 Represents namespace|element
389 """
390
391 - def __init__(self, namespace, element):
394
396 return '%s[%s]' % (
397 self.__class__.__name__,
398 self._format_element())
399
405
def xpath(self):
    """Return an ``XPathExpr`` whose element test names this element.

    With the wildcard namespace the element name is lower-cased; otherwise
    the name is emitted as ``namespace:element`` without changing case.
    """
    if self.namespace == '*':
        el = self.element.lower()
    else:
        el = '%s:%s' % (self.namespace, self.element)
    return XPathExpr(element=el)
413
415 """
416 Represents selector#id
417 """
418
420 self.selector = selector
421 self.id = id
422
424 return '%s[%r#%s]' % (
425 self.__class__.__name__,
426 self.selector, self.id)
427
432
434
438 return '%s(%r)' % (
439 self.__class__.__name__,
440 self.items)
441
445
447
448 _method_mapping = {
449 ' ': 'descendant',
450 '>': 'child',
451 '+': 'direct_adjacent',
452 '~': 'indirect_adjacent',
453 }
454
def __init__(self, selector, combinator, subselector):
    """Store the left selector, the combinator and the right selector.

    :param selector: left-hand selector; must not be None.
    :param combinator: the combinator token joining the two selectors.
    :param subselector: right-hand selector.
    """
    # Internal invariant: a combination always has a left-hand side.
    assert selector is not None
    self.selector = selector
    self.combinator = combinator
    self.subselector = subselector
460
462 if self.combinator == ' ':
463 comb = '<followed>'
464 else:
465 comb = self.combinator
466 return '%s[%r %s %r]' % (
467 self.__class__.__name__,
468 self.selector,
469 comb,
470 self.subselector)
471
480
485
490
497
502
503
504
505
# Patterns for three simple whole-string selector shapes; each tolerates
# trailing whitespace.
# A single word, e.g. "div".
_el_re = re.compile(r'^\w+\s*$', re.UNICODE)
# An optional word followed by "#word", e.g. "div#main" or "#main".
_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE)
# An optional word followed by ".word", e.g. "div.note" or ".note".
_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE)
509
511 if isinstance(css_expr, _basestring):
512 match = _el_re.search(css_expr)
513 if match is not None:
514 return '%s%s' % (prefix, match.group(0).strip())
515 match = _id_re.search(css_expr)
516 if match is not None:
517 return "%s%s[@id = '%s']" % (
518 prefix, match.group(1) or '*', match.group(2))
519 match = _class_re.search(css_expr)
520 if match is not None:
521 return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
522 prefix, match.group(1) or '*', match.group(2))
523 css_expr = parse(css_expr)
524 expr = css_expr.xpath()
525 assert expr is not None, (
526 "Got None for xpath expression from %s" % repr(css_expr))
527 if prefix:
528 expr.add_prefix(prefix)
529 return _unicode(expr)
530
532
def __init__(self, prefix=None, path=None, element='*', condition=None,
             star_prefix=False):
    """Store the pieces that are later concatenated into an XPath string.

    :param prefix: text placed before everything else, or None.
    :param path: text placed between the prefix and the element, or None.
    :param element: the element test; ``'*'`` by default.
    :param condition: predicate text (without brackets), or None.
    :param star_prefix: initial value of the ``star_prefix`` flag.
    """
    self.prefix = prefix
    self.path = path
    self.element = element
    self.condition = condition
    self.star_prefix = star_prefix
540
542 path = ''
543 if self.prefix is not None:
544 path += _unicode(self.prefix)
545 if self.path is not None:
546 path += _unicode(self.path)
547 path += _unicode(self.element)
548 if self.condition:
549 path += '[%s]' % self.condition
550 return path
551
553 return '%s[%s]' % (
554 self.__class__.__name__, self)
555
def add_condition(self, condition):
    """AND ``condition`` onto any condition already stored.

    The new condition is wrapped in parentheses when it is combined with an
    existing one, so an ``or`` inside it cannot change the meaning of the
    earlier text.
    """
    if self.condition:
        self.condition = '%s and (%s)' % (self.condition, condition)
    else:
        self.condition = condition
561
563 if self.path is None:
564 self.path = self.element
565 else:
566 self.path += self.element
567 self.element = part
568
574
def add_name_test(self):
    """Move a concrete element name into a ``name() = ...`` condition.

    Afterwards the element test is ``'*'``.  Nothing happens when the
    element test is already ``'*'``.
    """
    if self.element == '*':
        # No specific element was being tested, so there is nothing to move.
        return
    self.add_condition("name() = %s" % xpath_literal(self.element))
    self.element = '*'
581
def add_star_prefix(self):
    """Append a ``*/`` step to the path and set the ``star_prefix`` flag.

    The step is appended to an existing path, or becomes the whole path
    when none is set.
    """
    self.path = (self.path or '') + '*/'
    self.star_prefix = True
592
def join(self, combiner, other):
    """Turn this expression into ``<self><combiner><other>`` in place.

    The current expression, rendered as text and followed by ``combiner``,
    becomes the new prefix; path, element and condition are taken from
    ``other``.

    :param combiner: text placed between the two expressions.
    :param other: expression providing prefix, path, element, condition and
        the ``star_prefix`` flag.
    """
    prefix = _unicode(self) + combiner
    path = (other.prefix or '') + (other.path or '')
    # When the star step that ``other`` carries is its entire path, drop it.
    if other.star_prefix and path == '*/':
        path = ''
    self.prefix = prefix
    self.path = path
    self.element = other.element
    self.condition = other.condition
605
607 """
608 Represents |'d expressions. Note that unfortunately it isn't
609 the union, it's the sum, so duplicate elements will appear.
610 """
611
612 - def __init__(self, items, prefix=None):
617
621
# Splits a string at runs of single quotes, keeping the runs as items.
split_at_single_quotes = re.compile("('+)").split


def xpath_literal(s):
    """Return ``s`` written as an XPath string literal.

    An ``Element`` is rendered through its ``_format_element()`` method;
    anything else is converted to text.  The text is wrapped in whichever
    quote character it does not contain.  When it contains both kinds, a
    ``concat(...)`` call is built whose pieces are each quoted with the
    character they do not contain.
    """
    if isinstance(s, Element):
        s = s._format_element()
    else:
        s = _unicode(s)
    if "'" not in s:
        return "'%s'" % s
    if '"' not in s:
        return '"%s"' % s
    pieces = []
    for part in split_at_single_quotes(s):
        if not part:
            continue
        if "'" in part:
            # A run of single quotes: wrap it in double quotes.
            pieces.append('"%s"' % part)
        else:
            pieces.append("'%s'" % part)
    return "concat(%s)" % ','.join(pieces)
640
641
642
643
659
661 result = []
662 while 1:
663 result.append(parse_selector(stream))
664 if stream.peek() == ',':
665 stream.next()
666 else:
667 break
668 if len(result) == 1:
669 return result[0]
670 else:
671 return Or(result)
672
691
def parse_simple_selector(stream):
    """Parse one simple selector from ``stream`` and return its tree.

    Reads an optional ``namespace|element`` (defaulting to ``*`` for both),
    then any number of ``#id``, ``.class``, ``[attrib]``, ``:pseudo`` and
    ``:function(arg)`` parts, wrapping the result in the matching node each
    time.  A single-space token directly after the selector is consumed.

    Raises SelectorSyntaxError when a required token is missing or has the
    wrong kind.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        element = namespace = '*'
    else:
        token = stream.next()
        if token != '*' and not isinstance(token, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got '%s'" % token)
        if stream.peek() == '|':
            namespace = token
            stream.next()
            element = stream.next()
            # Validate the element name itself, not the namespace token
            # that was already checked above.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % element)
        else:
            namespace = '*'
            element = token
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # Only one #id is accepted per simple selector.
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            token = stream.next()
            if not token == ']':
                raise SelectorSyntaxError(
                    "] expected, got '%s'" % token)
        elif peek == ':' or peek == '::':
            pseudo_type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # Anything else is parsed as a nested simple selector.
                    selector = parse_simple_selector(stream)
                token = stream.next()
                if not token == ')':
                    raise SelectorSyntaxError(
                        "Expected ')', got '%s' and '%s'"
                        % (token, selector))
                result = Function(result, pseudo_type, ident, selector)
            else:
                result = Pseudo(result, pseudo_type, ident)
        else:
            if peek == ' ':
                stream.next()
            break
    return result
767
def is_int(v):
    """Return True when ``int(v)`` succeeds, False otherwise.

    Values that ``int()`` cannot accept at all (such as None) yield False
    instead of raising.
    """
    try:
        int(v)
    except (TypeError, ValueError):
        return False
    return True
775
def parse_attrib(selector, stream):
    """Parse the inside of ``[...]`` and return an ``Attrib`` node.

    The opening bracket must already have been consumed; the closing
    bracket is left in the stream.

    :param selector: the selector the attribute test is attached to.
    :param stream: token stream positioned just after ``[``.

    Raises SelectorSyntaxError for an unknown operator or a value that is
    neither a symbol nor a string.
    """
    attrib = stream.next()
    if stream.peek() == '|':
        # ``namespace|attrib``: what was read first is the namespace.
        namespace = attrib
        stream.next()
        attrib = stream.next()
    else:
        namespace = '*'
    if stream.peek() == ']':
        return Attrib(selector, namespace, attrib, 'exists', None)
    op = stream.next()
    if op not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got '%s'" % op)
    value = stream.next()
    if not isinstance(value, (Symbol, String)):
        raise SelectorSyntaxError(
            "Expected string or symbol, got '%s'" % value)
    return Attrib(selector, namespace, attrib, op, value)
795
def _parse_series_part(text, default):
    """Convert one side of an ``an+b`` expression to an int.

    Empty text gives ``default``; a bare sign gives +1 or -1.
    """
    if not text:
        return default
    if text == '-' or text == '+':
        return int(text + '1')
    return int(text)


def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)

    Also accepts an ``Element`` (rendered with ``_format_element()``), an
    int ``b`` (giving ``(0, b)``), an empty value or ``'*'`` (giving
    ``(0, 0)``), and the keywords ``'odd'``, ``'even'`` and ``'n'``.
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        # No argument at all, or the wildcard.
        return (0, 0)
    if isinstance(s, int):
        # A plain number is just the ``b`` part.
        return (0, s)
    if s == 'odd':
        return (2, 1)
    if s == 'even':
        return (2, 0)
    if s == 'n':
        return (1, 0)
    if 'n' not in s:
        # Numeric text without ``n`` is also just ``b``.
        return (0, int(s))
    a, b = s.split('n', 1)
    return (_parse_series_part(a, 1), _parse_series_part(b, 0))
831
832
833
834
835
836
# A run of one or more whitespace characters.
_whitespace_re = re.compile(r'\s+', re.UNICODE)

# A /* ... */ comment; DOTALL lets it span several lines, and the match is
# non-greedy so it stops at the first closing marker.
_comment_re = re.compile(r'/\*.*?\*/', re.DOTALL)

# An "an+b" style count: optional sign, optional digits, "n", then an
# optional signed number.
_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')
842
844 pos = 0
845 s = _comment_re.sub('', s)
846 while 1:
847 match = _whitespace_re.match(s, pos=pos)
848 if match:
849 preceding_whitespace_pos = pos
850 pos = match.end()
851 else:
852 preceding_whitespace_pos = 0
853 if pos >= len(s):
854 return
855 match = _count_re.match(s, pos=pos)
856 if match and match.group() != 'n':
857 sym = s[pos:match.end()]
858 yield Symbol(sym, pos)
859 pos = match.end()
860 continue
861 c = s[pos]
862 c2 = s[pos:pos+2]
863 if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
864 yield Token(c2, pos)
865 pos += 2
866 continue
867 if c in '>+~,.*=[]()|:#':
868 if c in '.#' and preceding_whitespace_pos > 0:
869 yield Token(' ', preceding_whitespace_pos)
870 yield Token(c, pos)
871 pos += 1
872 continue
873 if c == '"' or c == "'":
874
875 old_pos = pos
876 sym, pos = tokenize_escaped_string(s, pos)
877 yield String(sym, old_pos)
878 continue
879 old_pos = pos
880 sym, pos = tokenize_symbol(s, pos)
881 yield Symbol(sym, old_pos)
882 continue
883
# Splits a string at backslash escapes, keeping each escape as its own item.
# An escape is a backslash followed by either 1-6 hex digits (plus one
# optional trailing whitespace or CR LF) or a single non-hex character.
# Raw strings keep ``\s`` from being read as an invalid string escape.
split_at_string_escapes = re.compile(
    r'(\\(?:%s))' % '|'.join([r'[A-Fa-f0-9]{1,6}(?:\r\n|\s)?',
                              r'[^A-Fa-f0-9]'])).split


def unescape_string_literal(literal):
    """Return ``literal`` with its backslash escapes replaced.

    A hex escape becomes the character with that code point; any other
    escaped character stands for itself.

    Raises SelectorSyntaxError for a backslash that does not start a
    recognised escape.
    """
    substrings = []
    for substring in split_at_string_escapes(literal):
        if not substring:
            continue
        if '\\' in substring:
            if substring[0] == '\\' and len(substring) > 1:
                substring = substring[1:]
                if substring[0] in '0123456789ABCDEFabcdef':
                    # int() ignores the optional trailing whitespace.
                    substring = _unichr(int(substring, 16))
            else:
                raise SelectorSyntaxError(
                    "Invalid escape sequence %r in string %r"
                    % (substring.split('\\')[1], literal))
        substrings.append(substring)
    return ''.join(substrings)
905
def tokenize_escaped_string(s, pos):
    """Read the quoted string starting at ``s[pos]``.

    :param s: the text being tokenized.
    :param pos: index of the opening quote (``"`` or ``'``).
    :returns: ``(text, end)`` where ``text`` is the unescaped content and
        ``end`` is the index just past the closing quote.

    Raises SelectorSyntaxError when no closing quote is found.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    pos = pos + 1
    start = pos
    while 1:
        end = s.find(quote, pos)
        if end == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        result = s[start:end]
        # The quote is escaped only when preceded by an odd number of
        # backslashes; an even run is made of escaped backslashes.
        trailing_backslashes = len(result) - len(result.rstrip('\\'))
        if trailing_backslashes % 2:
            pos = end + 1
            continue
        if '\\' in result:
            result = unescape_string_literal(result)
        return result, end + 1
925
# Any character that cannot be part of a symbol: not a word character,
# a backslash or a hyphen.
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)


def tokenize_symbol(s, pos):
    """Read the symbol starting at ``s[pos]``.

    :returns: ``(symbol, end)`` where ``end`` is the index of the first
        character after the symbol.

    Raises SelectorSyntaxError when ``s[pos]`` cannot start a symbol or
    when the symbol's backslash escapes cannot be decoded.
    """
    start = pos
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # The symbol runs to the end of ``s``.
        # NOTE(review): this path returns the raw text without the escape
        # decoding applied below -- confirm whether that is intended.
        return s[start:], len(s)
    end = match.start()
    if end == pos:
        # Raise a real error rather than ``assert``: with assertions
        # disabled an empty symbol would be returned without advancing.
        raise SelectorSyntaxError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[start:end]
    try:
        result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        import sys
        e = sys.exc_info()[1]
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, end
951
953
def __init__(self, tokens, source=None):
    """Wrap an iterable of tokens.

    :param tokens: any iterable; it is turned into an iterator.
    :param source: optional value stored unchanged as ``self.source``.
    """
    self.used = []
    self.tokens = iter(tokens)
    self.source = source
    self.peeked = None
    self._peeking = False
    try:
        self.next_token = self.tokens.next
    except AttributeError:
        # The iterator has no ``next`` method; use ``__next__`` instead.
        self.next_token = self.tokens.__next__
965
def next(self):
    """Return the next token, or None when the tokens are exhausted.

    A token obtained earlier by peeking is returned first.  Every token
    returned is also appended to ``self.used``.
    """
    if self._peeking:
        self._peeking = False
        token = self.peeked
    else:
        try:
            token = self.next_token()
        except StopIteration:
            return None
    self.used.append(token)
    return token
978
981
def peek(self):
    """Return the upcoming token without consuming it.

    The token is fetched once and kept in ``self.peeked`` until it is
    consumed.  Returns None when the tokens are exhausted.
    """
    if self._peeking:
        return self.peeked
    try:
        self.peeked = self.next_token()
    except StopIteration:
        return None
    self._peeking = True
    return self.peeked
990