Package lxml :: Module cssselect
[frames | no frames]

Source Code for Module lxml.cssselect

  1  """CSS Selectors based on XPath. 
  2   
  3  This module supports selecting XML/HTML tags based on CSS selectors. 
  4  See the `CSSSelector` class for details. 
  5  """ 
  6   
  7  import re 
  8  from lxml import etree 
  9   
 10  __all__ = ['SelectorSyntaxError', 'ExpressionError', 
 11             'CSSSelector'] 
 12   
class SelectorSyntaxError(SyntaxError):
    """Raised when a CSS selector string cannot be tokenized or parsed."""
15
class ExpressionError(RuntimeError):
    """Raised when a parsed selector cannot be translated into XPath."""
18
class CSSSelector(etree.XPath):
    """A CSS selector.

    Usage::

        >>> from lxml import etree, cssselect
        >>> select = cssselect.CSSSelector("a tag > child")

        >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
        >>> [ el.tag for el in select(root) ]
        ['child']
    """

    def __init__(self, css):
        # Translate the CSS text first; the base class is initialised with
        # the resulting XPath string.
        etree.XPath.__init__(self, css_to_xpath(css))
        # Keep the original selector text around for __repr__.
        self.css = css

    def __repr__(self):
        class_name = self.__class__.__name__
        identity = hex(abs(id(self)))[2:]
        return '<%s %s for %r>' % (class_name, identity, self.css)
41 42 ############################## 43 ## Token objects: 44
45 -class _UniToken(unicode):
46 - def __new__(cls, contents, pos):
47 obj = unicode.__new__(cls, contents) 48 obj.pos = pos 49 return obj
50
51 - def __repr__(self):
52 return '%s(%s, %r)' % ( 53 self.__class__.__name__, 54 unicode.__repr__(self), 55 self.pos)
56
class Symbol(_UniToken):
    """Token type the tokenizer emits for bare words and an+b counts."""
59
class String(_UniToken):
    """Token type the tokenizer emits for quoted strings."""
62
class Token(_UniToken):
    """Token type the tokenizer emits for punctuation and operators."""
65 66 ############################################################ 67 ## Parsing 68 ############################################################ 69 70 ############################## 71 ## Syntax objects: 72
class Class(object):
    """
    Represents selector.class_name
    """

    def __init__(self, selector, class_name):
        self.selector, self.class_name = selector, class_name

    def __repr__(self):
        fields = (self.__class__.__name__, self.selector, self.class_name)
        return '%s[%r.%s]' % fields

    def xpath(self):
        """Return the selector's XPath with a class-membership test added."""
        result = self.selector.xpath()
        # Pad both the attribute and the wanted name with spaces so that
        # only whole, whitespace-separated class names match.
        wanted = xpath_repr(' ' + self.class_name + ' ')
        result.add_condition(
            "contains(concat(' ', normalize-space(@class), ' '), %s)" % wanted)
        return result
93
class Function(object):
    """
    Represents selector:name(expr)

    ``xpath()`` dispatches on ``name`` to a ``_xpath_<name>`` method (with
    dashes replaced by underscores); each of those receives the XPath
    expression of ``selector`` plus ``expr`` and returns the expression
    with extra conditions added.
    """

    # Pseudo-class functions that are recognised but cannot be translated.
    unsupported = [
        'target', 'lang', 'enabled', 'disabled',]

    def __init__(self, selector, type, name, expr):
        self.selector = selector
        self.type = type
        self.name = name
        self.expr = expr

    def __repr__(self):
        return '%s[%r%s%s(%r)]' % (
            self.__class__.__name__,
            self.selector,
            self.type, self.name, self.expr)

    def xpath(self):
        """Translate this function pseudo-class to XPath.

        Raises ExpressionError when the function name is listed in
        ``unsupported`` or has no ``_xpath_<name>`` handler.
        """
        sel_path = self.selector.xpath()
        if self.name in self.unsupported:
            raise ExpressionError(
                "The psuedo-class %r is not supported" % self.name)
        method = '_xpath_' + self.name.replace('-', '_')
        if not hasattr(self, method):
            raise ExpressionError(
                "The psuedo-class %r is unknown" % self.name)
        method = getattr(self, method)
        return method(sel_path, self.expr)

    def _xpath_nth_child(self, xpath, expr, last=False,
                         add_name_test=True):
        """Add positional conditions for an ``an+b`` argument.

        ``expr`` is handed to ``parse_series`` to obtain ``(a, b)``.
        ``last`` selects the counting-from-the-end variants and
        ``add_name_test`` controls whether the element name is turned into
        a ``name()`` condition before the ``*/`` prefix is added.
        """
        a, b = parse_series(expr)
        if not a and not b and not last:
            # a=0 means nothing is returned...
            xpath.add_condition('false() and position() = 0')
            return xpath
        if add_name_test:
            xpath.add_name_test()
        xpath.add_star_prefix()
        if a == 0:
            if last:
                b = 'last() - %s' % b
            xpath.add_condition('position() = %s' % b)
            return xpath
        if last:
            # FIXME: I'm not sure if this is right
            a = -a
            b = -b
        # b_neg is the text subtracting b from position(): "-b" or "+|b|".
        if b > 0:
            b_neg = str(-b)
        else:
            b_neg = '+%s' % (-b)
        if a != 1:
            conditions = ['(position() %s) mod %s = 0' % (b_neg, a)]
        else:
            conditions = []
        if b >= 0:
            conditions.append('position() >= %s' % b)
        elif b < 0 and last:
            conditions.append('position() < (last() %s)' % b)
        combined = ' and '.join(conditions)
        if combined:
            xpath.add_condition(combined)
        return xpath
        # FIXME: handle an+b, odd, even
        # an+b means every-a, plus b, e.g., 2n+1 means odd
        # 0n+b means b
        # n+0 means a=1, i.e., all elements
        # an means every a elements, i.e., 2n means even
        # -n means -1n
        # -1n+6 means elements 6 and previous

    def _xpath_nth_last_child(self, xpath, expr):
        return self._xpath_nth_child(xpath, expr, last=True)

    def _xpath_nth_of_type(self, xpath, expr):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, add_name_test=False)

    def _xpath_nth_last_of_type(self, xpath, expr):
        # Same restriction as _xpath_nth_of_type: without a concrete element
        # name the positional test would count all siblings, i.e. it would
        # silently behave like :nth-last-child().
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-last-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)

    def _xpath_contains(self, xpath, expr):
        # text content, minus tags, must contain expr
        if isinstance(expr, Element):
            expr = expr._format_element()
        xpath.add_condition('contains(css:lower-case(string(.)), %s)'
                            % xpath_repr(expr.lower()))
        # FIXME: Currently case insensitive matching doesn't seem to be happening
        return xpath

    def _xpath_not(self, xpath, expr):
        # everything for which not expr applies
        expr = expr.xpath()
        cond = expr.condition
        # FIXME: should I do something about element_path?
        xpath.add_condition('not(%s)' % cond)
        return xpath
198 -def _make_lower_case(context, s):
199 return s.lower()

# Register _make_lower_case under the '/css/' function namespace with the
# 'css' prefix, so generated XPath can call it as css:lower-case(...).
ns = etree.FunctionNamespace('/css/')
ns.prefix = 'css'
ns['lower-case'] = _make_lower_case

class Pseudo(object):
    """
    Represents selector:ident

    ``xpath()`` dispatches on ``ident`` to a ``_xpath_<ident>`` method
    (dashes replaced by underscores) that adds the matching conditions.
    """

    # Pseudo-classes/elements that are recognised but cannot be translated.
    unsupported = ['indeterminate', 'first-line', 'first-letter',
                   'selection', 'before', 'after', 'link', 'visited',
                   'active', 'focus', 'hover']

    def __init__(self, element, type, ident):
        self.element = element
        assert type in (':', '::')
        self.type = type
        self.ident = ident

    def __repr__(self):
        fields = (self.__class__.__name__, self.element,
                  self.type, self.ident)
        return '%s[%r%s%s]' % fields

    def xpath(self):
        """Translate this pseudo-class to XPath.

        Raises ExpressionError when ``ident`` is listed in ``unsupported``
        or has no ``_xpath_<ident>`` handler.
        """
        el_xpath = self.element.xpath()
        if self.ident in self.unsupported:
            raise ExpressionError(
                "The psuedo-class %r is unsupported" % self.ident)
        handler = getattr(self, '_xpath_' + self.ident.replace('-', '_'), None)
        if handler is None:
            raise ExpressionError(
                "The psuedo-class %r is unknown" % self.ident)
        return handler(el_xpath)

    def _xpath_checked(self, xpath):
        # FIXME: is this really all the elements?
        xpath.add_condition(
            "(@selected or @checked) and "
            "(name(.) = 'input' or name(.) = 'option')")
        return xpath

    def _xpath_root(self, xpath):
        # if this element is the root element
        raise NotImplementedError

    def _xpath_first_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_first_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:first-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:last-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_only_child(self, xpath):
        xpath.add_name_test()
        xpath.add_star_prefix()
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_only_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:only-of-type is not implemented")
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_empty(self, xpath):
        xpath.add_condition("not(*) and not(normalize-space())")
        return xpath
292
class Attrib(object):
    """
    Represents selector[namespace|attrib operator value]

    ``operator`` is either ``'exists'`` (no value) or one of the
    comparison operators handled in ``xpath()``.
    """

    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def __repr__(self):
        class_name = self.__class__.__name__
        if self.operator == 'exists':
            return '%s[%r[%s]]' % (
                class_name, self.selector, self._format_attrib())
        return '%s[%r[%s %s %r]]' % (
            class_name, self.selector, self._format_attrib(),
            self.operator, self.value)

    def _format_attrib(self):
        """Return the attribute name in CSS notation (``ns|name``)."""
        if self.namespace != '*':
            return '%s|%s' % (self.namespace, self.attrib)
        return self.attrib

    def _xpath_attrib(self):
        """Return the attribute reference in XPath notation (``@ns:name``)."""
        # FIXME: if attrib is *?
        if self.namespace != '*':
            return '@%s:%s' % (self.namespace, self.attrib)
        return '@' + self.attrib

    def xpath(self):
        """Return the selector's XPath with the attribute test added."""
        path = self.selector.xpath()
        attrib = self._xpath_attrib()
        value = self.value
        operator = self.operator
        add = path.add_condition
        if operator == 'exists':
            assert not value
            add(attrib)
        elif operator == '=':
            add('%s = %s' % (attrib, xpath_repr(value)))
        elif operator == '!=':
            # FIXME: this seems like a weird hack...
            if value:
                add('not(%s) or %s != %s'
                    % (attrib, attrib, xpath_repr(value)))
            else:
                add('%s != %s' % (attrib, xpath_repr(value)))
        elif operator == '~=':
            # Whole-word match inside a whitespace-separated list.
            add("contains(concat(' ', normalize-space(%s), ' '), %s)"
                % (attrib, xpath_repr(' ' + value + ' ')))
        elif operator == '|=':
            # Weird, but true...
            add('%s = %s or starts-with(%s, %s)' % (
                attrib, xpath_repr(value),
                attrib, xpath_repr(value + '-')))
        elif operator == '^=':
            add('starts-with(%s, %s)' % (attrib, xpath_repr(value)))
        elif operator == '$=':
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            add('substring(%s, string-length(%s)-%s) = %s'
                % (attrib, attrib, len(value) - 1, xpath_repr(value)))
        elif operator == '*=':
            # FIXME: case sensitive?
            add('contains(%s, %s)' % (attrib, xpath_repr(value)))
        else:
            assert 0, ("Unknown operator: %r" % self.operator)
        return path
372
class Element(object):
    """
    Represents namespace|element
    """

    def __init__(self, namespace, element):
        self.namespace, self.element = namespace, element

    def __repr__(self):
        return '%s[%s]' % (self.__class__.__name__, self._format_element())

    def _format_element(self):
        """Return the element name in CSS notation (``ns|name``)."""
        if self.namespace != '*':
            return '%s|%s' % (self.namespace, self.element)
        return self.element

    def xpath(self):
        """Return a fresh XPathExpr that matches this element name."""
        if self.namespace != '*':
            # FIXME: Should we lowercase here?
            name = '%s:%s' % (self.namespace, self.element)
        else:
            name = self.element.lower()
        return XPathExpr(element=name)
400
class Hash(object):
    """
    Represents selector#id
    """

    def __init__(self, selector, id):
        self.selector, self.id = selector, id

    def __repr__(self):
        fields = (self.__class__.__name__, self.selector, self.id)
        return '%s[%r#%s]' % fields

    def xpath(self):
        """Return the selector's XPath with an ``@id`` equality test added."""
        result = self.selector.xpath()
        id_test = '@id = %s' % xpath_repr(self.id)
        result.add_condition(id_test)
        return result
419
class Or(object):
    """Represents a comma-separated group of selectors."""

    def __init__(self, items):
        self.items = items

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.items)

    def xpath(self):
        """Translate every member and wrap the results in an XPathExprOr."""
        translated = []
        for member in self.items:
            translated.append(member.xpath())
        return XPathExprOr(translated)
432
class CombinedSelector(object):
    """Represents two selectors joined by a combinator (' ', >, + or ~)."""

    # Combinator token -> suffix of the _xpath_* method that handles it.
    _method_mapping = {
        ' ': 'descendant',
        '>': 'child',
        '+': 'direct_adjacent',
        '~': 'indirect_adjacent',
        }

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self):
        # A bare space would be invisible in the output, so spell it out.
        comb = self.combinator
        if comb == ' ':
            comb = '<followed>'
        return '%s[%r %s %r]' % (
            self.__class__.__name__, self.selector, comb, self.subselector)

    def xpath(self):
        """Translate both sides and join them as the combinator requires.

        Raises ExpressionError for a combinator missing from
        ``_method_mapping``.
        """
        suffix = self._method_mapping.get(self.combinator)
        if suffix is None:
            raise ExpressionError(
                "Unknown combinator: %r" % self.combinator)
        handler = getattr(self, '_xpath_' + suffix)
        left = self.selector.xpath()
        return handler(left, self.subselector)

    def _xpath_descendant(self, xpath, sub):
        # when sub is a descendant in any way of xpath
        xpath.join('/descendant::', sub.xpath())
        return xpath

    def _xpath_child(self, xpath, sub):
        # when sub is an immediate child of xpath
        xpath.join('/', sub.xpath())
        return xpath

    def _xpath_direct_adjacent(self, xpath, sub):
        # when sub immediately follows xpath
        xpath.join('/following-sibling::', sub.xpath())
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_indirect_adjacent(self, xpath, sub):
        # when sub comes somewhere after xpath as a sibling
        xpath.join('/following-sibling::', sub.xpath())
        return xpath

##############################
## XPathExpr objects:

# Fast-path patterns for the simplest selectors: a bare element name,
# "[element]#id" and "[element].class".
_el_re = re.compile(r'^\w+\s*$')
_id_re = re.compile(r'^(\w*)#(\w+)\s*$')
_class_re = re.compile(r'^(\w*)\.(\w+)\s*$')

# ``basestring`` only exists on Python 2; on Python 3 ``str`` is the only
# text type.
try:
    _basestring = basestring
except NameError:
    _basestring = str


def css_to_xpath(css_expr, prefix='descendant-or-self::'):
    """Translate a CSS selector into an XPath expression string.

    :param css_expr: selector text, or an already parsed selector object
        exposing ``xpath()``.
    :param prefix: axis/prefix prepended to the expression; pass a false
        value to get the bare expression.
    :returns: the XPath expression as a string.
    """
    if isinstance(css_expr, _basestring):
        # The three most common shapes are translated directly, without
        # running the tokenizer and parser.
        match = _el_re.search(css_expr)
        if match is not None:
            return '%s%s' % (prefix, match.group(0).strip())
        match = _id_re.search(css_expr)
        if match is not None:
            return "%s%s[@id = '%s']" % (
                prefix, match.group(1) or '*', match.group(2))
        match = _class_re.search(css_expr)
        if match is not None:
            return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
                prefix, match.group(1) or '*', match.group(2))
        css_expr = parse(css_expr)
    expr = css_expr.xpath()
    assert expr is not None, (
        "Got None for xpath expression from %s" % repr(css_expr))
    if prefix:
        expr.add_prefix(prefix)
    return str(expr)
517
class XPathExpr(object):
    """Mutable XPath step: ``prefix + path + element[condition]``."""

    def __init__(self, prefix=None, path=None, element='*', condition=None,
                 star_prefix=False):
        self.prefix = prefix
        self.path = path
        self.element = element
        self.condition = condition
        self.star_prefix = star_prefix

    def __str__(self):
        pieces = []
        if self.prefix is not None:
            pieces.append(str(self.prefix))
        if self.path is not None:
            pieces.append(str(self.path))
        pieces.append(str(self.element))
        if self.condition:
            pieces.append('[%s]' % self.condition)
        return ''.join(pieces)

    def __repr__(self):
        return '%s[%s]' % (self.__class__.__name__, self)

    def add_condition(self, condition):
        """AND ``condition`` onto the predicate (or set it if there is none)."""
        if not self.condition:
            self.condition = condition
            return
        self.condition = '%s and (%s)' % (self.condition, condition)

    def add_path(self, part):
        """Move the current element into ``path`` and make ``part`` the element."""
        if self.path is not None:
            self.path += self.element
        else:
            self.path = self.element
        self.element = part

    def add_prefix(self, prefix):
        """Put ``prefix`` in front of any prefix already present."""
        if not self.prefix:
            self.prefix = prefix
            return
        self.prefix = prefix + self.prefix

    def add_name_test(self):
        """Replace a concrete element name by ``*`` plus a ``name()`` test."""
        if self.element == '*':
            # We weren't doing a test anyway
            return
        self.add_condition("name() = %s" % xpath_repr(self.element))
        self.element = '*'

    def add_star_prefix(self):
        """
        Adds a /* prefix if there is no prefix.  This is when you need
        to keep context's constrained to a single parent.
        """
        if not self.path:
            self.path = '*/'
        else:
            self.path += '*/'
        self.star_prefix = True

    def join(self, combiner, other):
        """Turn this expression into ``<self><combiner><other>`` in place."""
        # Everything built so far, plus the combiner, becomes the new prefix.
        new_prefix = str(self) + combiner
        new_path = (other.prefix or '') + (other.path or '')
        # We don't need a star prefix if we are joining to this other
        # prefix; so we'll get rid of it
        if other.star_prefix and new_path == '*/':
            new_path = ''
        self.prefix = new_prefix
        self.path = new_path
        self.element = other.element
        self.condition = other.condition
592
class XPathExprOr(XPathExpr):
    """
    Represents |'d expressions.  Note that unfortunately it isn't
    the union, it's the sum, so duplicate elements will appear.
    """

    def __init__(self, items, prefix=None):
        for member in items:
            assert member is not None
        self.items = items
        self.prefix = prefix

    def __str__(self):
        # The prefix is repeated in front of every alternative.
        lead = self.prefix or ''
        alternatives = []
        for member in self.items:
            alternatives.append(lead + str(member))
        return ' | '.join(alternatives)
608
def xpath_repr(s):
    """Return ``s`` as an XPath 1.0 string literal.

    XPath literals have no escape mechanism: a literal is delimited by
    either kind of quote and may not contain its own delimiter.  Text
    containing both quote characters is therefore expressed as a
    ``concat(...)`` call over pieces that each contain only one kind.

    :param s: text, or an ``Element`` (which is rendered through its
        ``_format_element()`` form first).
    :returns: the literal (or ``concat`` expression) as text.
    """
    if isinstance(s, Element):
        # This is probably a symbol that looks like an expression...
        s = s._format_element()
    s = '%s' % (s,)
    if "'" not in s:
        return "'%s'" % s
    if '"' not in s:
        return '"%s"' % s
    # Both quote kinds are present: single-quote each run between the
    # apostrophes and insert every apostrophe as its own double-quoted
    # literal.
    pieces = []
    for index, chunk in enumerate(s.split("'")):
        if index:
            pieces.append('"\'"')
        if chunk:
            pieces.append("'%s'" % chunk)
    return 'concat(%s)' % ', '.join(pieces)
617 618 ############################## 619 ## Parsing functions 620
def parse(string):
    """Tokenize and parse a CSS selector string into a selector tree.

    :param string: the selector text.
    :returns: whatever ``parse_selector_group`` builds for the token stream.
    :raises SelectorSyntaxError: re-raised with the tokens consumed so far
        and the tokens still pending appended to the message.
    """
    stream = TokenStream(tokenize(string))
    stream.source = string
    try:
        return parse_selector_group(stream)
    # "except X as e" is accepted by Python 2.6+ and Python 3 alike; the
    # older "except X, e" spelling is a syntax error on Python 3.
    except SelectorSyntaxError as e:
        e.args = tuple(["%s at %s -> %s" % (
            e, stream.used, list(stream))])
        raise
630
def parse_selector_group(stream):
    """Parse one or more comma-separated selectors from ``stream``.

    Returns the single selector when there is only one, otherwise an
    ``Or`` wrapping all of them.
    """
    selectors = [parse_selector(stream)]
    while stream.peek() == ',':
        stream.next()
        selectors.append(parse_selector(stream))
    if len(selectors) == 1:
        return selectors[0]
    return Or(selectors)
643
def parse_selector(stream):
    """Parse simple selectors joined by combinators, up to ',' or the end.

    Two simple selectors with no explicit combinator token between them
    are joined with the ' ' (descendant) combinator.
    """
    current = parse_simple_selector(stream)
    while True:
        upcoming = stream.peek()
        if upcoming == ',' or upcoming is None:
            return current
        if upcoming in ('+', '>', '~'):
            # A combinator
            combinator = stream.next()
        else:
            combinator = ' '
        right_hand = parse_simple_selector(stream)
        current = CombinedSelector(current, combinator, right_hand)
658
def parse_simple_selector(stream):
    """Parse one simple selector: an optional element name plus qualifiers.

    The qualifiers handled are ``#id``, ``.class``, ``[attribute ...]``,
    ``:pseudo`` / ``::pseudo`` and ``:function(argument)``.

    :param stream: the token stream to consume from.
    :returns: an ``Element`` wrapped in one object per qualifier found.
    :raises SelectorSyntaxError: when an unexpected token is encountered.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        # No explicit element name: match any element.
        element = namespace = '*'
    else:
        token = stream.next()
        if token != '*' and not isinstance(token, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got %r" % (token,))
        if stream.peek() == '|':
            namespace = token
            stream.next()
            element = stream.next()
            # Validate the element name that follows "ns|" -- the token
            # that was just read, not the namespace token before it.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % (element,))
        else:
            namespace = '*'
            element = token
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # You can't have two hashes
                # (FIXME: is there some more general rule I'm missing?)
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            token = stream.next()
            if not token == ']':
                raise SelectorSyntaxError(
                    "] expected, got %r" % (token,))
            continue
        elif peek == ':' or peek == '::':
            pseudo_type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % (ident,))
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # FIXME: parse_simple_selector, or selector, or...?
                    selector = parse_simple_selector(stream)
                token = stream.next()
                if not token == ')':
                    raise SelectorSyntaxError(
                        "Expected ), got %r and %r"
                        % (token, selector))
                result = Function(result, pseudo_type, ident, selector)
            else:
                result = Pseudo(result, pseudo_type, ident)
            continue
        else:
            if peek == ' ':
                # Consume the whitespace token so the caller sees what
                # follows it.
                stream.next()
            break
        # FIXME: not sure what "negation" is
    return result
734
def is_int(v):
    """Return True when ``int(v)`` succeeds, False when it raises ValueError."""
    try:
        int(v)
    except ValueError:
        return False
    return True
742
def parse_attrib(selector, stream):
    """Parse the inside of ``[...]`` (the '[' is already consumed).

    Returns an ``Attrib`` wrapping ``selector``; the closing ']' is left
    in the stream.  Raises SelectorSyntaxError on a bad operator or value.
    """
    namespace = '*'
    attrib = stream.next()
    if stream.peek() == '|':
        # What was read so far was the namespace; the name follows the bar.
        namespace = attrib
        stream.next()
        attrib = stream.next()
    if stream.peek() == ']':
        return Attrib(selector, namespace, attrib, 'exists', None)
    op = stream.next()
    if op not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got %r" % op)
    value = stream.next()
    if isinstance(value, (Symbol, String)):
        return Attrib(selector, namespace, attrib, op, value)
    raise SelectorSyntaxError(
        "Expected string or symbol, got %r" % value)
762
def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        # Happens when there's nothing, which the CSS parser thinks of as *
        return (0, 0)
    if isinstance(s, int):
        # Happens when you just get a number
        return (0, s)
    keywords = {'odd': (2, 1), 'even': (2, 0), 'n': (1, 0)}
    if s in keywords:
        return keywords[s]
    if 'n' not in s:
        # Just a b
        return (0, int(s))

    def to_number(text, when_empty):
        # A lone sign stands for +1 / -1; an empty part takes the default.
        if not text:
            return when_empty
        if text in ('-', '+'):
            return int(text + '1')
        return int(text)

    a_text, b_text = s.split('n', 1)
    a = to_number(a_text, 1)
    b = to_number(b_text, 0)
    return (a, b)


############################################################
## Tokenizing
############################################################

# A run of whitespace characters.
_whitespace_re = re.compile(r'\s+')

# A /* ... */ comment, matched non-greedily; re.S lets '.' cross newlines.
_comment_re = re.compile(r'/\*.*?\*/', re.S)

# An optionally signed "<digits>n" with an optional signed integer offset,
# e.g. 2n+1, -n+6 or n-3.
_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')

def tokenize(s):
    """Generate Symbol, String and Token objects for the selector ``s``.

    Comments are removed first and whitespace between tokens is skipped,
    except that a ' ' Token is emitted in front of a '.' or '#' that
    follows whitespace.
    """
    s = _comment_re.sub('', s)
    pos = 0
    while True:
        blank = _whitespace_re.match(s, pos=pos)
        if blank:
            preceding_whitespace_pos = pos
            pos = blank.end()
        else:
            preceding_whitespace_pos = 0
        if pos >= len(s):
            return
        count = _count_re.match(s, pos=pos)
        if count and count.group() != 'n':
            # An an+b style count; a lone "n" is left to the symbol path.
            end = count.end()
            yield Symbol(s[pos:end], pos)
            pos = end
        elif s[pos:pos+2] in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
            yield Token(s[pos:pos+2], pos)
            pos += 2
        elif s[pos] in '>+~,.*=[]()|:#':
            char = s[pos]
            if char in '.#' and preceding_whitespace_pos > 0:
                yield Token(' ', preceding_whitespace_pos)
            yield Token(char, pos)
            pos += 1
        elif s[pos] == '"' or s[pos] == "'":
            # Quoted string
            begin = pos
            text, pos = tokenize_escaped_string(s, pos)
            yield String(text, begin)
        else:
            begin = pos
            text, pos = tokenize_symbol(s, pos)
            yield Symbol(text, begin)
850
def tokenize_escaped_string(s, pos):
    """Read the quoted string that starts at ``s[pos]``.

    :param s: the selector source.
    :param pos: index of the opening quote character.
    :returns: ``(text, next_pos)`` -- the string contents with backslash
        escapes decoded, and the index just past the closing quote.
    :raises SelectorSyntaxError: when no closing quote is found.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    pos = pos + 1
    start = pos
    while 1:
        closing = s.find(quote, pos)
        if closing == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        result = s[start:closing]
        try:
            if isinstance(result, bytes):
                # Python 2 byte string: decode directly.
                result = result.decode('unicode_escape')
            else:
                # Text: turn non-ASCII characters into backslash escapes so
                # the unicode_escape codec round-trips them, then decode.
                result = result.encode(
                    'ascii', 'backslashreplace').decode('unicode_escape')
        except UnicodeDecodeError:
            # Probably a hanging \ -- i.e. the quote that was found is
            # escaped, so keep looking for the real closing quote.
            pos = closing + 1
        else:
            return result, closing + 1

# Any character that cannot be part of a symbol: anything other than word
# characters, the backslash and the dash.
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)


def tokenize_symbol(s, pos):
    """Read the symbol (identifier-like run) that starts at ``s[pos]``.

    :param s: the selector source.
    :param pos: index at which the symbol starts.
    :returns: ``(text, next_pos)`` -- the symbol with backslash escapes
        decoded, and the index of the first character after it.
    :raises SelectorSyntaxError: when ``s[pos]`` cannot start a symbol or
        the symbol contains a malformed escape.
    """
    match = _illegal_symbol.search(s, pos=pos)
    # Without a match the symbol runs to the end of the input.
    end = len(s) if match is None else match.start()
    if end == pos and match is not None:
        # Raise rather than assert: this is input validation, and with
        # assertions disabled the tokenizer would never advance.
        raise SelectorSyntaxError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[pos:end]
    try:
        if isinstance(result, bytes):
            # Python 2 byte string: decode directly.
            result = result.decode('unicode_escape')
        else:
            # Text: turn non-ASCII characters into backslash escapes so the
            # unicode_escape codec round-trips them, then decode.
            result = result.encode(
                'ascii', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError as e:
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, end
894
class TokenStream(object):
    """Token iterator with one token of lookahead.

    ``next()`` and ``peek()`` return ``None`` once the tokens run out
    instead of raising ``StopIteration``.  Every token handed out by
    ``next()`` is also appended to ``used``.
    """

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        # Lookahead slot: ``peeked`` is only meaningful while ``_peeking``.
        self.peeked = None
        self._peeking = False

    def next(self):
        """Consume and return the next token, or None at the end."""
        if self._peeking:
            self._peeking = False
            self.used.append(self.peeked)
            return self.peeked
        try:
            # The builtin next() works with both the Python 2 and the
            # Python 3 iterator protocol.
            token = next(self.tokens)
        except StopIteration:
            return None
        self.used.append(token)
        return token

    def __iter__(self):
        # Iterate by calling next() until it returns the None sentinel.
        return iter(self.next, None)

    def peek(self):
        """Return the next token without consuming it, or None at the end."""
        if not self._peeking:
            try:
                self.peeked = next(self.tokens)
            except StopIteration:
                return None
            self._peeking = True
        return self.peeked
928