Package lxml :: Module cssselect
[hide private]
[frames | no frames]

Source Code for Module lxml.cssselect

  1  """CSS Selectors based on XPath. 
  2   
  3  This module supports selecting XML/HTML tags based on CSS selectors. 
  4  See the `CSSSelector` class for details. 
  5  """ 
  6   
  7  import re 
  8  from lxml import etree 
  9   
 10  __all__ = ['SelectorSyntaxError', 'ExpressionError', 
 11             'CSSSelector'] 
 12   
 13  try: 
 14      _basestring = basestring 
 15  except NameError: 
 16      _basestring = str 
 17   
class SelectorSyntaxError(SyntaxError):
    """Raised by the tokenizer and parser when a selector is malformed."""
20
class ExpressionError(RuntimeError):
    """Raised when a parsed selector cannot be translated to XPath."""
23
class CSSSelector(etree.XPath):
    """A CSS selector.

    The selector is translated with ``css_to_xpath`` and the result is
    handed to ``etree.XPath``; the original CSS text stays available as
    the ``css`` attribute.

    Usage::

        >>> from lxml import etree, cssselect
        >>> select = cssselect.CSSSelector("a tag > child")

        >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
        >>> [ el.tag for el in select(root) ]
        ['child']
    """

    def __init__(self, css):
        xpath_text = css_to_xpath(css)
        etree.XPath.__init__(self, xpath_text)
        self.css = css

    def __repr__(self):
        class_name = self.__class__.__name__
        # Hex object id without the leading '0x'.
        identity = hex(abs(id(self)))[2:]
        return '<%s %s for %r>' % (class_name, identity, self.css)

##############################
## Token objects:

# ``unicode`` only exists on Python 2; ``str`` is the text type on Python 3.
try:
    unicode
except NameError:
    # Python 3
    _unicode = str
else:
    _unicode = unicode

class _UniToken(_unicode):
    """Text string that also remembers its offset in the selector source.

    ``pos`` is whatever position the tokenizer passed in when it created
    the token.
    """

    def __new__(cls, contents, pos):
        token = _unicode.__new__(cls, contents)
        token.pos = pos
        return token

    def __repr__(self):
        class_name = self.__class__.__name__
        text_repr = _unicode.__repr__(self)
        return '%s(%s, %r)' % (class_name, text_repr, self.pos)
67
class Symbol(_UniToken):
    """Token type the tokenizer yields for names and ``an+b`` counts."""
70
class String(_UniToken):
    """Token type the tokenizer yields for the contents of a quoted string."""
73
class Token(_UniToken):
    """Token type the tokenizer yields for punctuation and operators."""

############################################################
## Parsing
############################################################

##############################
## Syntax objects:

class Class(object):
    """
    Represents selector.class_name
    """

    def __init__(self, selector, class_name):
        self.selector = selector
        self.class_name = class_name

    def __repr__(self):
        return '%s[%r.%s]' % (
            self.__class__.__name__, self.selector, self.class_name)

    def xpath(self):
        """Return the selector's XPath with a whole-word @class test added."""
        result = self.selector.xpath()
        # Pad both the attribute and the wanted name with spaces so only
        # complete, space-separated class names match.
        padded_name = xpath_repr(' ' + self.class_name + ' ')
        result.add_condition(
            "contains(concat(' ', normalize-space(@class), ' '), %s)"
            % padded_name)
        return result
104
class Function(object):
    """
    Represents selector:name(expr)

    ``xpath()`` looks up a method called ``_xpath_<name>`` (dashes in the
    name replaced by underscores) and calls it with the XPath expression of
    ``selector`` and the raw argument ``expr``.
    """

    # Functional pseudo-classes that are recognised but cannot be translated.
    unsupported = [
        'target', 'lang', 'enabled', 'disabled',]

    def __init__(self, selector, type, name, expr):
        self.selector = selector
        self.type = type
        self.name = name
        self.expr = expr

    def __repr__(self):
        return '%s[%r%s%s(%r)]' % (
            self.__class__.__name__,
            self.selector,
            self.type, self.name, self.expr)

    def xpath(self):
        """Translate to an XPath expression object.

        Raises ExpressionError when the function name is listed in
        ``unsupported`` or has no ``_xpath_*`` handler.
        """
        sel_path = self.selector.xpath()
        if self.name in self.unsupported:
            raise ExpressionError(
                "The pseudo-class %r is not supported" % self.name)
        method = '_xpath_' + self.name.replace('-', '_')
        if not hasattr(self, method):
            raise ExpressionError(
                "The pseudo-class %r is unknown" % self.name)
        method = getattr(self, method)
        return method(sel_path, self.expr)

    def _xpath_nth_child(self, xpath, expr, last=False,
                         add_name_test=True):
        """Add the condition for ``:nth-child(expr)`` and its variants.

        ``expr`` is parsed by ``parse_series`` into ``(a, b)``.  ``last``
        selects counting from the end; ``add_name_test`` moves the element
        name into a ``name()`` condition so that ``position()`` is counted
        over all sibling elements rather than only same-named ones.
        """
        a, b = parse_series(expr)
        if not a and not b and not last:
            # a=0 means nothing is returned...
            xpath.add_condition('false() and position() = 0')
            return xpath
        if add_name_test:
            xpath.add_name_test()
        xpath.add_star_prefix()
        if a == 0:
            if last:
                # The b-th element from the end has b - 1 siblings after it,
                # so :nth-last-child(1) must be position() = last().
                b = 'last() - %s' % (b - 1)
            xpath.add_condition('position() = %s' % b)
            return xpath
        if last:
            # FIXME: I'm not sure if this is right
            a = -a
            b = -b
        if b > 0:
            b_neg = str(-b)
        else:
            b_neg = '+%s' % (-b)
        if a != 1:
            expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
        else:
            expr = []
        if b >= 0:
            expr.append('position() >= %s' % b)
        elif b < 0 and last:
            expr.append('position() < (last() %s)' % b)
        expr = ' and '.join(expr)
        if expr:
            xpath.add_condition(expr)
        return xpath
        # FIXME: handle an+b, odd, even
        # an+b means every-a, plus b, e.g., 2n+1 means odd
        # 0n+b means b
        # n+0 means a=1, i.e., all elements
        # an means every a elements, i.e., 2n means even
        # -n means -1n
        # -1n+6 means elements 6 and previous

    def _xpath_nth_last_child(self, xpath, expr):
        return self._xpath_nth_child(xpath, expr, last=True)

    def _xpath_nth_of_type(self, xpath, expr):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, add_name_test=False)

    def _xpath_nth_last_of_type(self, xpath, expr):
        # Same restriction as :nth-of-type(): without a concrete element
        # name the result would silently be :nth-last-child() instead.
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-last-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)

    def _xpath_contains(self, xpath, expr):
        # text content, minus tags, must contain expr
        if isinstance(expr, Element):
            expr = expr._format_element()
        xpath.add_condition('contains(css:lower-case(string(.)), %s)'
                            % xpath_repr(expr.lower()))
        # FIXME: Currently case insensitive matching doesn't seem to be happening
        return xpath

    def _xpath_not(self, xpath, expr):
        # everything for which not expr applies
        expr = expr.xpath()
        cond = expr.condition
        # FIXME: should I do something about element_path?
        xpath.add_condition('not(%s)' % cond)
        return xpath
def _make_lower_case(context, s):
    """Return ``s.lower()``; ``context`` is accepted but not used."""
    lowered = s.lower()
    return lowered

# Expose _make_lower_case to XPath as ``css:lower-case`` (used by the
# :contains() translation).
ns = etree.FunctionNamespace('/css/')
ns.prefix = 'css'
ns['lower-case'] = _make_lower_case

class Pseudo(object):
    """
    Represents selector:ident

    ``xpath()`` looks up a method called ``_xpath_<ident>`` (dashes in the
    identifier replaced by underscores) and applies it to the XPath
    expression of ``element``.
    """

    # Pseudo-classes/elements that are recognised but cannot be translated.
    unsupported = ['indeterminate', 'first-line', 'first-letter',
                   'selection', 'before', 'after', 'link', 'visited',
                   'active', 'focus', 'hover']

    def __init__(self, element, type, ident):
        self.element = element
        assert type in (':', '::')
        self.type = type
        self.ident = ident

    def __repr__(self):
        return '%s[%r%s%s]' % (
            self.__class__.__name__,
            self.element,
            self.type, self.ident)

    def xpath(self):
        """Translate to an XPath expression object.

        Raises ExpressionError when the identifier is listed in
        ``unsupported`` or has no ``_xpath_*`` handler.
        """
        el_xpath = self.element.xpath()
        if self.ident in self.unsupported:
            raise ExpressionError(
                "The pseudo-class %r is unsupported" % self.ident)
        method = '_xpath_' + self.ident.replace('-', '_')
        if not hasattr(self, method):
            raise ExpressionError(
                "The pseudo-class %r is unknown" % self.ident)
        method = getattr(self, method)
        el_xpath = method(el_xpath)
        return el_xpath

    def _xpath_checked(self, xpath):
        # FIXME: is this really all the elements?
        xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
        return xpath

    def _xpath_root(self, xpath):
        # if this element is the root element
        raise NotImplementedError

    def _xpath_first_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_first_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:first-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:last-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_only_child(self, xpath):
        xpath.add_name_test()
        xpath.add_star_prefix()
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_only_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:only-of-type is not implemented")
        # Like the other *-of-type tests, last() must be evaluated among the
        # children of one parent, so pin the step to a child context.
        xpath.add_star_prefix()
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_empty(self, xpath):
        xpath.add_condition("not(*) and not(normalize-space())")
        return xpath
303
class Attrib(object):
    """
    Represents selector[namespace|attrib operator value]

    ``operator`` is the string ``'exists'`` for a bare ``[attrib]`` test,
    otherwise one of the CSS attribute operators handled in ``xpath()``.
    """

    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def __repr__(self):
        class_name = self.__class__.__name__
        if self.operator == 'exists':
            return '%s[%r[%s]]' % (
                class_name, self.selector, self._format_attrib())
        return '%s[%r[%s %s %r]]' % (
            class_name, self.selector, self._format_attrib(),
            self.operator, self.value)

    def _format_attrib(self):
        """Return the attribute name in CSS notation (``ns|name``)."""
        if self.namespace != '*':
            return '%s|%s' % (self.namespace, self.attrib)
        return self.attrib

    def _xpath_attrib(self):
        """Return the attribute name in XPath notation (``@ns:name``)."""
        # FIXME: if attrib is *?
        if self.namespace != '*':
            return '@%s:%s' % (self.namespace, self.attrib)
        return '@' + self.attrib

    def xpath(self):
        """Return the selector's XPath with the attribute test added."""
        path = self.selector.xpath()
        attrib = self._xpath_attrib()
        value = self.value
        op = self.operator
        if op == 'exists':
            assert not value
            path.add_condition(attrib)
        elif op == '=':
            path.add_condition('%s = %s' % (attrib, xpath_repr(value)))
        elif op == '!=':
            # FIXME: this seems like a weird hack...
            if value:
                template = 'not(%s) or %s != %s'
                args = (attrib, attrib, xpath_repr(value))
            else:
                template = '%s != %s'
                args = (attrib, xpath_repr(value))
            path.add_condition(template % args)
        elif op == '~=':
            padded = xpath_repr(' ' + value + ' ')
            path.add_condition(
                "contains(concat(' ', normalize-space(%s), ' '), %s)"
                % (attrib, padded))
        elif op == '|=':
            # Weird, but true...
            exact = xpath_repr(value)
            dashed = xpath_repr(value + '-')
            path.add_condition('%s = %s or starts-with(%s, %s)' % (
                attrib, exact, attrib, dashed))
        elif op == '^=':
            path.add_condition('starts-with(%s, %s)' % (
                attrib, xpath_repr(value)))
        elif op == '$=':
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            tail_offset = len(value) - 1
            path.add_condition('substring(%s, string-length(%s)-%s) = %s'
                               % (attrib, attrib, tail_offset,
                                  xpath_repr(value)))
        elif op == '*=':
            # FIXME: case sensitive?
            path.add_condition('contains(%s, %s)' % (
                attrib, xpath_repr(value)))
        else:
            assert 0, ("Unknown operator: %r" % self.operator)
        return path
383
class Element(object):
    """
    Represents namespace|element

    A namespace of ``'*'`` means no namespace was given.
    """

    def __init__(self, namespace, element):
        self.namespace = namespace
        self.element = element

    def __repr__(self):
        return '%s[%s]' % (self.__class__.__name__, self._format_element())

    def _format_element(self):
        """Return the element name in CSS notation (``ns|name``)."""
        if self.namespace != '*':
            return '%s|%s' % (self.namespace, self.element)
        return self.element

    def xpath(self):
        """Return an XPathExpr whose element test is this element."""
        if self.namespace != '*':
            # FIXME: Should we lowercase here?
            name = '%s:%s' % (self.namespace, self.element)
        else:
            name = self.element.lower()
        return XPathExpr(element=name)
411
class Hash(object):
    """
    Represents selector#id
    """

    def __init__(self, selector, id):
        self.selector = selector
        self.id = id

    def __repr__(self):
        return '%s[%r#%s]' % (
            self.__class__.__name__, self.selector, self.id)

    def xpath(self):
        """Return the selector's XPath with an ``@id`` equality test added."""
        result = self.selector.xpath()
        id_literal = xpath_repr(self.id)
        result.add_condition('@id = %s' % id_literal)
        return result
430
class Or(object):
    """Represents a comma-separated group of selectors."""

    def __init__(self, items):
        self.items = items

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.items)

    def xpath(self):
        """Translate every member and wrap the results in an XPathExprOr."""
        translated = []
        for member in self.items:
            translated.append(member.xpath())
        return XPathExprOr(translated)
443
class CombinedSelector(object):
    """Represents ``selector <combinator> subselector``.

    The combinator is one of the keys of ``_method_mapping``; a single
    space stands for the descendant combinator.
    """

    # combinator -> suffix of the ``_xpath_*`` method that implements it
    _method_mapping = {
        ' ': 'descendant',
        '>': 'child',
        '+': 'direct_adjacent',
        '~': 'indirect_adjacent',
        }

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self):
        shown = self.combinator
        if shown == ' ':
            shown = '<followed>'
        return '%s[%r %s %r]' % (
            self.__class__.__name__, self.selector, shown, self.subselector)

    def xpath(self):
        """Translate both sides and join them according to the combinator.

        Raises ExpressionError for a combinator not in ``_method_mapping``.
        """
        if self.combinator not in self._method_mapping:
            raise ExpressionError(
                "Unknown combinator: %r" % self.combinator)
        handler_name = '_xpath_' + self._method_mapping[self.combinator]
        handler = getattr(self, handler_name)
        left = self.selector.xpath()
        return handler(left, self.subselector)

    def _xpath_descendant(self, xpath, sub):
        # when sub is a descendant in any way of xpath
        right = sub.xpath()
        xpath.join('/descendant::', right)
        return xpath

    def _xpath_child(self, xpath, sub):
        # when sub is an immediate child of xpath
        right = sub.xpath()
        xpath.join('/', right)
        return xpath

    def _xpath_direct_adjacent(self, xpath, sub):
        # when sub immediately follows xpath
        right = sub.xpath()
        xpath.join('/following-sibling::', right)
        # Test the name as a condition so position() = 1 refers to the very
        # next sibling of any name.
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_indirect_adjacent(self, xpath, sub):
        # when sub comes somewhere after xpath as a sibling
        right = sub.xpath()
        xpath.join('/following-sibling::', right)
        return xpath

##############################
## XPathExpr objects:

# Patterns for the simplest selectors, which css_to_xpath translates
# directly without running the parser.
_el_re = re.compile(r'^\w+\s*$')                # element
_id_re = re.compile(r'^(\w*)#(\w+)\s*$')        # element#id or #id
_class_re = re.compile(r'^(\w*)\.(\w+)\s*$')    # element.class or .class

def css_to_xpath(css_expr, prefix='descendant-or-self::'):
    """Translate a CSS selector into an XPath expression string.

    ``css_expr`` is either selector text or an already parsed selector
    object (anything with an ``xpath()`` method).  ``prefix`` is put in
    front of the result; a false value (``None`` or ``''``) means no prefix.

    Text that is just ``element``, ``element#id`` / ``#id`` or
    ``element.class`` / ``.class`` is translated directly by regular
    expression; everything else goes through ``parse``.
    """
    if isinstance(css_expr, _basestring):
        # The parsed path below ignores a false prefix, so the shortcuts
        # must not format None into the result either.
        fast_prefix = prefix or ''
        match = _el_re.search(css_expr)
        if match is not None:
            return '%s%s' % (fast_prefix, match.group(0).strip())
        match = _id_re.search(css_expr)
        if match is not None:
            return "%s%s[@id = '%s']" % (
                fast_prefix, match.group(1) or '*', match.group(2))
        match = _class_re.search(css_expr)
        if match is not None:
            return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
                fast_prefix, match.group(1) or '*', match.group(2))
        css_expr = parse(css_expr)
    expr = css_expr.xpath()
    assert expr is not None, (
        "Got None for xpath expression from %s" % repr(css_expr))
    if prefix:
        expr.add_prefix(prefix)
    return str(expr)
528
class XPathExpr(object):
    """Mutable pieces of one XPath expression.

    The string form is ``prefix + path + element + '[' + condition + ']'``,
    where ``prefix`` and ``path`` are skipped when ``None`` and the
    predicate is skipped when ``condition`` is empty.
    """

    def __init__(self, prefix=None, path=None, element='*', condition=None,
                 star_prefix=False):
        self.prefix = prefix
        self.path = path
        self.element = element
        self.condition = condition
        # True once add_star_prefix() has been called on this expression.
        self.star_prefix = star_prefix

    def __str__(self):
        pieces = []
        if self.prefix is not None:
            pieces.append(str(self.prefix))
        if self.path is not None:
            pieces.append(str(self.path))
        pieces.append(str(self.element))
        if self.condition:
            pieces.append('[%s]' % self.condition)
        return ''.join(pieces)

    def __repr__(self):
        return '%s[%s]' % (
            self.__class__.__name__, self)

    def add_condition(self, condition):
        """AND ``condition`` onto the predicate.

        Both sides are parenthesised when combining: XPath's ``and`` binds
        tighter than ``or``, so an existing ``a or b`` must not be joined
        as ``a or b and (c)``.
        """
        if self.condition:
            self.condition = '(%s) and (%s)' % (self.condition, condition)
        else:
            self.condition = condition

    def add_path(self, part):
        """Move the current element onto the path and make ``part`` the element."""
        if self.path is None:
            self.path = self.element
        else:
            self.path += self.element
        self.element = part

    def add_prefix(self, prefix):
        """Put ``prefix`` in front of any existing prefix."""
        if self.prefix:
            self.prefix = prefix + self.prefix
        else:
            self.prefix = prefix

    def add_name_test(self):
        """Turn the element test into a ``name() = ...`` condition.

        Afterwards the element is ``*``.  Nothing happens when the element
        is already ``*``.
        """
        if self.element == '*':
            # We weren't doing a test anyway
            return
        self.add_condition("name() = %s" % xpath_repr(self.element))
        self.element = '*'

    def add_star_prefix(self):
        """
        Appends ``*/`` to the path (or sets the path to ``*/`` when it is
        empty) and marks the expression with ``star_prefix``.  This is for
        when you need to keep contexts constrained to a single parent.
        """
        if self.path:
            self.path += '*/'
        else:
            self.path = '*/'
        self.star_prefix = True

    def join(self, combiner, other):
        """Turn this expression into ``<self><combiner><other>`` in place.

        The current string form plus ``combiner`` becomes the new prefix;
        path, element and condition are taken over from ``other``.
        """
        prefix = str(self)
        prefix += combiner
        path = (other.prefix or '') + (other.path or '')
        # We don't need a star prefix if we are joining to this other
        # prefix; so we'll get rid of it
        if other.star_prefix and path == '*/':
            path = ''
        self.prefix = prefix
        self.path = path
        self.element = other.element
        self.condition = other.condition
603
class XPathExprOr(XPathExpr):
    """
    Represents |'d expressions: the string form is the items joined with
    `` | ``, each one preceded by the prefix.

    NOTE(review): the original docstring said the result "isn't the union,
    it's the sum, so duplicate elements will appear" -- nothing in this
    class shows that either way; confirm against the XPath evaluator.
    """

    def __init__(self, items, prefix=None):
        for member in items:
            assert member is not None
        self.items = items
        self.prefix = prefix

    def __str__(self):
        lead = self.prefix or ''
        alternatives = []
        for member in self.items:
            alternatives.append(lead + str(member))
        return ' | '.join(alternatives)
619
def xpath_repr(s):
    """Return ``s`` as an XPath 1.0 string literal.

    XPath 1.0 literals have no escape mechanism: a literal is delimited by
    single or double quotes and cannot contain its own delimiter.  Python's
    ``repr()`` was used here before, but it emits backslash escapes that
    XPath does not understand.  Text containing both kinds of quote is
    built with ``concat()`` instead.

    ``s`` may also be an ``Element`` (a symbol that was parsed as an
    element name), in which case its CSS spelling is used.
    """
    if isinstance(s, Element):
        # This is probably a symbol that looks like an expression...
        s = s._format_element()
    else:
        s = _unicode(s)
    if "'" not in s:
        return "'%s'" % s
    if '"' not in s:
        return '"%s"' % s
    # Both quote characters occur: split on the single quotes and glue the
    # pieces back together with a double-quoted "'" between them.
    pieces = []
    for index, chunk in enumerate(s.split("'")):
        if index:
            pieces.append('"\'"')
        if chunk:
            pieces.append("'%s'" % chunk)
    return 'concat(%s)' % ', '.join(pieces)

##############################
## Parsing functions

def parse(string):
    """Parse CSS selector text into a tree of syntax objects.

    A SelectorSyntaxError raised while parsing is re-raised with its
    message extended by the tokens consumed so far and the tokens left.
    """
    stream = TokenStream(tokenize(string), source=string)
    try:
        return parse_selector_group(stream)
    except SelectorSyntaxError:
        import sys
        error = sys.exc_info()[1]
        # list(stream) drains whatever tokens are left for the report.
        message = "%s at %s -> %s" % (error, stream.used, list(stream))
        error.args = (message,)
        raise
643
def parse_selector_group(stream):
    """Parse one or more selectors separated by commas.

    Returns the single selector when there is only one, otherwise an
    ``Or`` holding all of them.
    """
    selectors = [parse_selector(stream)]
    while stream.peek() == ',':
        stream.next()
        selectors.append(parse_selector(stream))
    if len(selectors) != 1:
        return Or(selectors)
    return selectors[0]
656
def parse_selector(stream):
    """Parse simple selectors chained by combinators.

    Stops at a comma or at the end of the stream.  When two simple
    selectors follow each other without ``+``, ``>`` or ``~`` between them,
    the descendant combinator ``' '`` is used.
    """
    selector = parse_simple_selector(stream)
    while True:
        upcoming = stream.peek()
        if upcoming is None or upcoming == ',':
            return selector
        if upcoming in ('+', '>', '~'):
            # A combinator
            combinator = stream.next()
        else:
            combinator = ' '
        right_hand = parse_simple_selector(stream)
        selector = CombinedSelector(selector, combinator, right_hand)
671
def parse_simple_selector(stream):
    """Parse one simple selector: an optional element followed by modifiers.

    The element part is ``name``, ``*``, or ``ns|name``; when the next
    token is neither ``*`` nor a Symbol the element defaults to ``*``.
    It is followed by any number of ``#id``, ``.class``, ``[attrib]``,
    ``:pseudo`` and ``:function(arg)`` parts.

    Raises SelectorSyntaxError when an expected symbol or closing bracket
    is missing.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        element = namespace = '*'
    else:
        first = stream.next()
        if first != '*' and not isinstance(first, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got %r" % first)
        if stream.peek() == '|':
            namespace = first
            stream.next()
            element = stream.next()
            # Validate the local name itself; checking the namespace token
            # again would let a missing or bogus element through.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % element)
        else:
            namespace = '*'
            element = first
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # You can't have two hashes
                # (FIXME: is there some more general rule I'm missing?)
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            closing = stream.next()
            if not closing == ']':
                raise SelectorSyntaxError(
                    "] expected, got %r" % closing)
            continue
        elif peek == ':' or peek == '::':
            colon = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # FIXME: parse_simple_selector, or selector, or...?
                    selector = parse_simple_selector(stream)
                closing = stream.next()
                if not closing == ')':
                    raise SelectorSyntaxError(
                        "Expected ), got %r and %r"
                        % (closing, selector))
                result = Function(result, colon, ident, selector)
            else:
                result = Pseudo(result, colon, ident)
            continue
        else:
            if peek == ' ':
                stream.next()
            break
        # FIXME: not sure what "negation" is
    return result
747
def is_int(v):
    """Return True when ``int(v)`` succeeds, False when it raises ValueError."""
    try:
        int(v)
    except ValueError:
        return False
    return True
755
def parse_attrib(selector, stream):
    """Parse the inside of ``[...]`` and wrap ``selector`` in an Attrib.

    The opening ``[`` has already been consumed; the closing ``]`` is left
    on the stream.  Raises SelectorSyntaxError for an unknown operator or
    a value that is neither a Symbol nor a String.
    """
    name = stream.next()
    namespace = '*'
    if stream.peek() == '|':
        # What was read so far is the namespace; the name follows the bar.
        namespace = name
        stream.next()
        name = stream.next()
    if stream.peek() == ']':
        return Attrib(selector, namespace, name, 'exists', None)
    op = stream.next()
    known_operators = ('^=', '$=', '*=', '=', '~=', '|=', '!=')
    if op not in known_operators:
        raise SelectorSyntaxError(
            "Operator expected, got %r" % op)
    value = stream.next()
    if not isinstance(value, (Symbol, String)):
        raise SelectorSyntaxError(
            "Expected string or symbol, got %r" % value)
    return Attrib(selector, namespace, name, op, value)
775
def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)

    Also accepts an Element (its CSS spelling is used), an int (treated as
    ``b``), the keywords ``odd`` and ``even``, and an empty value or ``*``
    (both give ``(0, 0)``).
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        # Happens when there's nothing, which the CSS parser thinks of as *
        return (0, 0)
    if isinstance(s, int):
        # Happens when you just get a number
        return (0, s)
    if s == 'odd':
        return (2, 1)
    if s == 'even':
        return (2, 0)
    if s == 'n':
        return (1, 0)
    if 'n' not in s:
        # Just a b
        return (0, int(s))
    step_text, offset_text = s.split('n', 1)
    if not step_text:
        step = 1
    elif step_text == '-' or step_text == '+':
        # A bare sign means -1 or +1.
        step = int(step_text + '1')
    else:
        step = int(step_text)
    if not offset_text:
        offset = 0
    elif offset_text == '-' or offset_text == '+':
        offset = int(offset_text + '1')
    else:
        offset = int(offset_text)
    return (step, offset)


############################################################
## Tokenizing
############################################################

# A run of whitespace.
_whitespace_re = re.compile(r'\s+')

# A /* ... */ comment; may span lines.
_comment_re = re.compile(r'/\*.*?\*/', re.S)

# An "an+b" style count such as 2n+1, -n+6 or n.
_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')

def tokenize(s):
    """Yield the tokens of selector text ``s``.

    Comments are removed first.  Whitespace is skipped, except that a
    ``Token(' ')`` is emitted in front of a ``.`` or ``#`` that follows
    whitespace.  Counts like ``2n+1`` and names become Symbol objects,
    quoted text becomes String objects, punctuation and operators become
    Token objects.
    """
    two_char_operators = ('~=', '|=', '^=', '$=', '*=', '::', '!=')
    text = _comment_re.sub('', s)
    length = len(text)
    pos = 0
    while True:
        blank = _whitespace_re.match(text, pos=pos)
        if blank:
            preceding_whitespace_pos = pos
            pos = blank.end()
        else:
            # 0 doubles as "no whitespace in front of this token".
            preceding_whitespace_pos = 0
        if pos >= length:
            return
        count = _count_re.match(text, pos=pos)
        if count and count.group() != 'n':
            # A lone "n" is left to the symbol tokenizer below.
            yield Symbol(text[pos:count.end()], pos)
            pos = count.end()
            continue
        char = text[pos]
        pair = text[pos:pos+2]
        if pair in two_char_operators:
            yield Token(pair, pos)
            pos += 2
            continue
        if char in '>+~,.*=[]()|:#':
            if char in '.#' and preceding_whitespace_pos > 0:
                yield Token(' ', preceding_whitespace_pos)
            yield Token(char, pos)
            pos += 1
            continue
        token_start = pos
        if char == '"' or char == "'":
            # Quoted string
            value, pos = tokenize_escaped_string(text, pos)
            yield String(value, token_start)
            continue
        value, pos = tokenize_symbol(text, pos)
        yield Symbol(value, token_start)
863
def tokenize_escaped_string(s, pos):
    """Read the quoted string that starts at ``s[pos]``.

    Returns ``(text, next_pos)``: the content with backslash escapes
    decoded, and the index just past the closing quote.  A candidate
    closing quote whose preceding text fails to decode (typically because
    it ends in a backslash) is treated as part of the string and the search
    continues.  Raises SelectorSyntaxError when no closing quote is found.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    start = pos + 1
    search_from = start
    while True:
        end = s.find(quote, search_from)
        if end == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        raw = s[start:end]
        try:
            decoded = raw.encode('ASCII', 'backslashreplace').decode('unicode_escape')
        except UnicodeDecodeError:
            # Probably a hanging \
            search_from = end + 1
            continue
        return decoded, end + 1

# Any character that cannot be part of a symbol (word characters,
# backslash and dash are allowed).
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)


def tokenize_symbol(s, pos):
    """Read the symbol that starts at ``s[pos]``.

    Returns ``(text, next_pos)``.  A symbol that runs to the end of ``s``
    is returned as is; one that is followed by other text has its backslash
    escapes decoded first.

    Raises SelectorSyntaxError when ``s[pos]`` cannot start a symbol or the
    escapes cannot be decoded.
    """
    start = pos
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # Goes to end of s
        return s[start:], len(s)
    if match.start() == pos:
        # A real error, not an assert: with asserts disabled the caller
        # would receive an empty symbol without advancing and never finish.
        raise SelectorSyntaxError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[start:match.start()]
    pos = match.start()
    try:
        result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        import sys
        e = sys.exc_info()[1]
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, pos
909
class TokenStream(object):
    """Iterator wrapper with one token of lookahead.

    ``used`` collects every token handed out by ``next()``.  The end of the
    stream is reported as ``None`` by both ``next()`` and ``peek()``.
    """

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked = None
        self._peeking = False
        # Iterators spell "advance" as .next on Python 2 and .__next__ on
        # Python 3.
        advance = getattr(self.tokens, 'next', None)
        if advance is None:
            # Python 3
            advance = self.tokens.__next__
        self.next_token = advance

    def next(self):
        """Consume and return the next token, or None at the end."""
        if self._peeking:
            # Hand out the token that peek() already fetched.
            self._peeking = False
            self.used.append(self.peeked)
            return self.peeked
        try:
            token = self.next_token()
        except StopIteration:
            return None
        self.used.append(token)
        return token

    def __iter__(self):
        # Calls next() repeatedly until it returns None.
        return iter(self.next, None)

    def peek(self):
        """Return the next token without consuming it, or None at the end."""
        if self._peeking:
            return self.peeked
        try:
            self.peeked = self.next_token()
        except StopIteration:
            return None
        self._peeking = True
        return self.peeked
948