Package lxml :: Module cssselect
[hide private]
[frames | no frames]

Source Code for Module lxml.cssselect

  1  """CSS Selectors based on XPath. 
  2   
  3  This module supports selecting XML/HTML tags based on CSS selectors. 
  4  See the `CSSSelector` class for details. 
  5  """ 
  6   
  7  import re 
  8  from lxml import etree 
  9   
 10  __all__ = ['SelectorSyntaxError', 'ExpressionError', 
 11             'CSSSelector'] 
 12   
 13  try: 
 14      _basestring = basestring 
 15  except NameError: 
 16      _basestring = str 
 17   
class SelectorSyntaxError(SyntaxError):
    """Raised by the tokenizer and parser for a malformed CSS selector."""
20
class ExpressionError(RuntimeError):
    """Raised when a parsed selector cannot be translated to XPath."""
23
class CSSSelector(etree.XPath):
    """A CSS selector.

    Usage::

        >>> from lxml import etree, cssselect
        >>> select = cssselect.CSSSelector("a tag > child")

        >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
        >>> [ el.tag for el in select(root) ]
        ['child']
    """

    def __init__(self, css):
        # The selector is translated to an XPath expression once, here;
        # the original CSS text is kept only for ``repr()``.
        etree.XPath.__init__(self, css_to_xpath(css))
        self.css = css

    def __repr__(self):
        cls_name = self.__class__.__name__
        ident = hex(abs(id(self)))[2:]
        return '<%s %s for %r>' % (cls_name, ident, self.css)

##############################
## Token objects:

# Python 2 has separate ``unicode``/``unichr`` builtins; when they are
# missing the ``str``/``chr`` builtins are used instead.
try:
    _unicode = unicode
    _unichr = unichr
except NameError:
    # Python 3
    _unicode = str
    _unichr = chr

class _UniToken(_unicode):
    """Text token that also records the position it was read from.

    The token compares and hashes like its text; ``pos`` is extra metadata.
    """

    def __new__(cls, contents, pos):
        token = _unicode.__new__(cls, contents)
        token.pos = pos
        return token

    def __repr__(self):
        text_repr = _unicode.__repr__(self)
        return '%s(%s, %r)' % (self.__class__.__name__, text_repr, self.pos)
69
class Symbol(_UniToken):
    """Token type the tokenizer uses for identifiers and ``an+b`` counts."""
72
class String(_UniToken):
    """Token type the tokenizer uses for quoted string literals."""
75
class Token(_UniToken):
    """Token type the tokenizer uses for punctuation and operators."""
78 79 ############################################################ 80 ## Parsing 81 ############################################################ 82 83 ############################## 84 ## Syntax objects: 85
class Class(object):
    """
    Represents selector.class_name
    """

    def __init__(self, selector, class_name):
        self.class_name = class_name
        self.selector = selector

    def __repr__(self):
        parts = (self.__class__.__name__, self.selector, self.class_name)
        return '%s[%r.%s]' % parts

    def xpath(self):
        """Return the selector's XPath with a whole-word ``@class`` test added."""
        result = self.selector.xpath()
        # Padding both sides with spaces makes ``contains`` match complete
        # class names only, never a fragment of a longer name.
        padded_name = xpath_literal(' ' + self.class_name + ' ')
        result.add_condition(
            "contains(concat(' ', normalize-space(@class), ' '), %s)" % padded_name)
        return result
106
class Function(object):
    """
    Represents selector:name(expr)
    """

    # Functional pseudo-classes that are recognised but deliberately rejected.
    unsupported = [
        'target', 'lang', 'enabled', 'disabled',]

    def __init__(self, selector, type, name, expr):
        self.selector = selector
        self.type = type
        self.name = name
        self.expr = expr

    def __repr__(self):
        return '%s[%r%s%s(%r)]' % (
            self.__class__.__name__,
            self.selector,
            self.type, self.name, self.expr)

    def xpath(self):
        """Translate to XPath by dispatching on the pseudo-class name.

        ``foo-bar`` is handled by the method ``_xpath_foo_bar``.
        Raises ExpressionError for unsupported or unknown names.
        """
        sel_path = self.selector.xpath()
        if self.name in self.unsupported:
            raise ExpressionError(
                "The pseudo-class %r is not supported" % self.name)
        method = getattr(self, '_xpath_' + self.name.replace('-', '_'), None)
        if method is None:
            raise ExpressionError(
                "The pseudo-class %r is unknown" % self.name)
        return method(sel_path, self.expr)

    def _xpath_nth_child(self, xpath, expr, last=False,
                         add_name_test=True):
        """Add the positional condition for an ``an+b`` argument.

        ``last`` counts from the end (the nth-last-* variants);
        ``add_name_test`` moves the element name into a ``name()`` test so
        that position is counted over all siblings rather than same-named
        ones.
        """
        a, b = parse_series(expr)
        if not a and not b and not last:
            # a=0 means nothing is returned...
            xpath.add_condition('false() and position() = 0')
            return xpath
        if add_name_test:
            xpath.add_name_test()
        xpath.add_star_prefix()
        if a == 0:
            if last:
                # The b-th element counted from the end sits at
                # last() - (b - 1): :nth-last-child(1) is the last child.
                b = 'last() - %s' % (b - 1)
            xpath.add_condition('position() = %s' % b)
            return xpath
        if last:
            # FIXME: I'm not sure if this is right
            a = -a
            b = -b
        if b > 0:
            b_neg = str(-b)
        else:
            b_neg = '+%s' % (-b)
        if a != 1:
            expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
        else:
            expr = []
        if b >= 0:
            expr.append('position() >= %s' % b)
        elif b < 0 and last:
            expr.append('position() < (last() %s)' % b)
        expr = ' and '.join(expr)
        if expr:
            xpath.add_condition(expr)
        return xpath
        # FIXME: handle an+b, odd, even
        # an+b means every-a, plus b, e.g., 2n+1 means odd
        # 0n+b means b
        # n+0 means a=1, i.e., all elements
        # an means every a elements, i.e., 2n means even
        # -n means -1n
        # -1n+6 means elements 6 and previous

    def _xpath_nth_last_child(self, xpath, expr):
        return self._xpath_nth_child(xpath, expr, last=True)

    def _xpath_nth_of_type(self, xpath, expr):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, add_name_test=False)

    def _xpath_nth_last_of_type(self, xpath, expr):
        # Same restriction as nth-of-type: with '*' the position would be
        # counted over all siblings, which is nth-last-child semantics.
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-last-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)

    def _xpath_contains(self, xpath, expr):
        # text content, minus tags, must contain expr
        if isinstance(expr, Element):
            expr = expr._format_element()
        xpath.add_condition('contains(css:lower-case(string(.)), %s)'
                            % xpath_literal(expr.lower()))
        # FIXME: Currently case insensitive matching doesn't seem to be happening
        return xpath

    def _xpath_not(self, xpath, expr):
        # everything for which not expr applies
        expr = expr.xpath()
        cond = expr.condition
        # FIXME: should I do something about element_path?
        xpath.add_condition('not(%s)' % cond)
        return xpath
210
def _make_lower_case(context, s):
    """Return ``s`` lower-cased; ``context`` is accepted but not used."""
    lowered = s.lower()
    return lowered

# Register ``_make_lower_case`` under the name ``lower-case`` in the
# '/css/' function namespace, bound to the ``css`` prefix, so generated
# expressions can call ``css:lower-case(...)``.
ns = etree.FunctionNamespace('/css/')
ns.prefix = 'css'
ns['lower-case'] = _make_lower_case

class Pseudo(object):
    """
    Represents selector:ident
    """

    # Pseudo-classes/elements that are recognised but deliberately rejected.
    unsupported = ['indeterminate', 'first-line', 'first-letter',
                   'selection', 'before', 'after', 'link', 'visited',
                   'active', 'focus', 'hover']

    def __init__(self, element, type, ident):
        self.element = element
        assert type in (':', '::')
        self.type = type
        self.ident = ident

    def __repr__(self):
        return '%s[%r%s%s]' % (
            self.__class__.__name__,
            self.element,
            self.type, self.ident)

    def xpath(self):
        """Translate to XPath by dispatching on the pseudo-class name.

        ``foo-bar`` is handled by the method ``_xpath_foo_bar``.
        Raises ExpressionError for unsupported or unknown names.
        """
        el_xpath = self.element.xpath()
        if self.ident in self.unsupported:
            raise ExpressionError(
                "The pseudo-class %r is unsupported" % self.ident)
        method = getattr(self, '_xpath_' + self.ident.replace('-', '_'), None)
        if method is None:
            raise ExpressionError(
                "The pseudo-class %r is unknown" % self.ident)
        return method(el_xpath)

    def _xpath_checked(self, xpath):
        # FIXME: is this really all the elements?
        xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
        return xpath

    def _xpath_root(self, xpath):
        # if this element is the root element
        raise NotImplementedError

    def _xpath_first_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_first_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:first-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:last-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_only_child(self, xpath):
        xpath.add_name_test()
        xpath.add_star_prefix()
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_only_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:only-of-type is not implemented")
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_empty(self, xpath):
        # No child elements and no non-whitespace text.
        xpath.add_condition("not(*) and not(normalize-space())")
        return xpath
305
class Attrib(object):
    """
    Represents selector[namespace|attrib operator value]
    """

    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def __repr__(self):
        cls_name = self.__class__.__name__
        if self.operator == 'exists':
            return '%s[%r[%s]]' % (
                cls_name, self.selector, self._format_attrib())
        return '%s[%r[%s %s %r]]' % (
            cls_name, self.selector, self._format_attrib(),
            self.operator, self.value)

    def _format_attrib(self):
        """Return the attribute in CSS notation (``ns|name`` or ``name``)."""
        if self.namespace != '*':
            return '%s|%s' % (self.namespace, self.attrib)
        return self.attrib

    def _xpath_attrib(self):
        """Return the attribute in XPath notation (``@ns:name`` or ``@name``)."""
        # FIXME: if attrib is *?
        if self.namespace != '*':
            return '@%s:%s' % (self.namespace, self.attrib)
        return '@' + self.attrib

    def xpath(self):
        """Return the selector's XPath with this attribute test added."""
        path = self.selector.xpath()
        attrib = self._xpath_attrib()
        op = self.operator
        value = self.value
        add = path.add_condition
        if op == 'exists':
            assert not value
            add(attrib)
        elif op == '=':
            add('%s = %s' % (attrib, xpath_literal(value)))
        elif op == '!=':
            # FIXME: this seems like a weird hack...
            if value:
                add('not(%s) or %s != %s'
                    % (attrib, attrib, xpath_literal(value)))
            else:
                add('%s != %s' % (attrib, xpath_literal(value)))
        elif op == '~=':
            # Whole-word match inside a space-separated list.
            add("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' ')))
        elif op == '|=':
            # Weird, but true...
            add('%s = %s or starts-with(%s, %s)' % (
                attrib, xpath_literal(value),
                attrib, xpath_literal(value + '-')))
        elif op == '^=':
            add('starts-with(%s, %s)' % (attrib, xpath_literal(value)))
        elif op == '$=':
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            add('substring(%s, string-length(%s)-%s) = %s'
                % (attrib, attrib, len(value)-1, xpath_literal(value)))
        elif op == '*=':
            # FIXME: case sensitive?
            add('contains(%s, %s)' % (attrib, xpath_literal(value)))
        else:
            assert 0, ("Unknown operator: %r" % self.operator)
        return path
385
class Element(object):
    """
    Represents namespace|element
    """

    def __init__(self, namespace, element):
        self.element = element
        self.namespace = namespace

    def __repr__(self):
        return '%s[%s]' % (self.__class__.__name__, self._format_element())

    def _format_element(self):
        """Return the element in CSS notation (``ns|name`` or ``name``)."""
        if self.namespace != '*':
            return '%s|%s' % (self.namespace, self.element)
        return self.element

    def xpath(self):
        """Return a fresh XPathExpr that selects this element by name."""
        if self.namespace == '*':
            name = self.element.lower()
        else:
            # FIXME: Should we lowercase here?
            name = '%s:%s' % (self.namespace, self.element)
        return XPathExpr(element=name)
413
class Hash(object):
    """
    Represents selector#id
    """

    def __init__(self, selector, id):
        self.id = id
        self.selector = selector

    def __repr__(self):
        parts = (self.__class__.__name__, self.selector, self.id)
        return '%s[%r#%s]' % parts

    def xpath(self):
        """Return the selector's XPath with an ``@id`` equality test added."""
        result = self.selector.xpath()
        result.add_condition('@id = %s' % xpath_literal(self.id))
        return result
432
class Or(object):
    """Holds a list of alternative selectors (built for a comma-separated group)."""

    def __init__(self, items):
        self.items = items

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.items)

    def xpath(self):
        """Translate every alternative and wrap the results in XPathExprOr."""
        translated = []
        for item in self.items:
            translated.append(item.xpath())
        return XPathExprOr(translated)
445
class CombinedSelector(object):
    """Two selectors joined by a combinator (' ', '>', '+' or '~')."""

    # Combinator token -> suffix of the ``_xpath_*`` method that handles it.
    _method_mapping = {
        ' ': 'descendant',
        '>': 'child',
        '+': 'direct_adjacent',
        '~': 'indirect_adjacent',
        }

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self):
        # A bare space would be invisible in the output.
        comb = self.combinator
        if comb == ' ':
            comb = '<followed>'
        return '%s[%r %s %r]' % (
            self.__class__.__name__, self.selector, comb, self.subselector)

    def xpath(self):
        """Translate both sides and join them; unknown combinators raise
        ExpressionError."""
        mapping = self._method_mapping
        if self.combinator not in mapping:
            raise ExpressionError(
                "Unknown combinator: %r" % self.combinator)
        handler = getattr(self, '_xpath_' + mapping[self.combinator])
        left = self.selector.xpath()
        return handler(left, self.subselector)

    def _xpath_descendant(self, xpath, sub):
        # when sub is a descendant in any way of xpath
        sub_xpath = sub.xpath()
        xpath.join('/descendant::', sub_xpath)
        return xpath

    def _xpath_child(self, xpath, sub):
        # when sub is an immediate child of xpath
        sub_xpath = sub.xpath()
        xpath.join('/', sub_xpath)
        return xpath

    def _xpath_direct_adjacent(self, xpath, sub):
        # when sub immediately follows xpath
        sub_xpath = sub.xpath()
        xpath.join('/following-sibling::', sub_xpath)
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_indirect_adjacent(self, xpath, sub):
        # when sub comes somewhere after xpath as a sibling
        sub_xpath = sub.xpath()
        xpath.join('/following-sibling::', sub_xpath)
        return xpath

##############################
## XPathExpr objects:

# Patterns for the three simplest selector shapes: a bare element name,
# ``elem#id`` and ``elem.class`` (element part optional in the last two).
# Trailing whitespace is tolerated.
_el_re = re.compile(r'^\w+\s*$', re.UNICODE)
_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE)
_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE)

def css_to_xpath(css_expr, prefix='descendant-or-self::'):
    """Translate a CSS selector into an XPath expression string.

    ``css_expr`` is either selector text or an object with an ``xpath()``
    method.  Text matching one of the simple patterns (``elem``,
    ``elem#id``, ``elem.class``) is translated directly; any other text is
    parsed first.  ``prefix`` is prepended to the result when non-empty.
    """
    if isinstance(css_expr, _basestring):
        found = _el_re.search(css_expr)
        if found is not None:
            return '%s%s' % (prefix, found.group(0).strip())
        found = _id_re.search(css_expr)
        if found is not None:
            tag = found.group(1) or '*'
            return "%s%s[@id = '%s']" % (prefix, tag, found.group(2))
        found = _class_re.search(css_expr)
        if found is not None:
            tag = found.group(1) or '*'
            return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
                prefix, tag, found.group(2))
        css_expr = parse(css_expr)
    xpath_expr = css_expr.xpath()
    assert xpath_expr is not None, (
        "Got None for xpath expression from %s" % repr(css_expr))
    if prefix:
        xpath_expr.add_prefix(prefix)
    return _unicode(xpath_expr)
530
class XPathExpr(object):
    """Mutable XPath step rendered as ``prefix + path + element[condition]``."""

    def __init__(self, prefix=None, path=None, element='*', condition=None,
                 star_prefix=False):
        self.prefix = prefix
        self.path = path
        self.element = element
        self.condition = condition
        self.star_prefix = star_prefix

    def __str__(self):
        pieces = []
        if self.prefix is not None:
            pieces.append(_unicode(self.prefix))
        if self.path is not None:
            pieces.append(_unicode(self.path))
        pieces.append(_unicode(self.element))
        if self.condition:
            pieces.append('[%s]' % self.condition)
        return ''.join(pieces)

    def __repr__(self):
        return '%s[%s]' % (self.__class__.__name__, self)

    def add_condition(self, condition):
        """AND ``condition`` onto the predicate (the first one is stored as-is)."""
        if not self.condition:
            self.condition = condition
        else:
            self.condition = '%s and (%s)' % (self.condition, condition)

    def add_path(self, part):
        """Append the current element to the path and make ``part`` the element."""
        if self.path is not None:
            self.path += self.element
        else:
            self.path = self.element
        self.element = part

    def add_prefix(self, prefix):
        """Put ``prefix`` in front of any prefix already present."""
        if not self.prefix:
            self.prefix = prefix
        else:
            self.prefix = prefix + self.prefix

    def add_name_test(self):
        """Turn the element name into a ``name()`` condition on ``*``."""
        if self.element == '*':
            # We weren't doing a test anyway
            return
        self.add_condition("name() = %s" % xpath_literal(self.element))
        self.element = '*'

    def add_star_prefix(self):
        """
        Adds a /* prefix if there is no prefix.  This is when you need
        to keep context's constrained to a single parent.
        """
        self.path = (self.path or '') + '*/'
        self.star_prefix = True

    def join(self, combiner, other):
        """Fold this expression into the prefix and continue with ``other``.

        The rendered form of ``self`` plus ``combiner`` becomes the new
        prefix; path, element and condition are taken over from ``other``.
        """
        new_prefix = _unicode(self) + combiner
        other_path = (other.prefix or '') + (other.path or '')
        # We don't need a star prefix if we are joining to this other
        # prefix; so we'll get rid of it
        if other.star_prefix and other_path == '*/':
            other_path = ''
        self.prefix = new_prefix
        self.path = other_path
        self.element = other.element
        self.condition = other.condition
605
class XPathExprOr(XPathExpr):
    """
    Represents |'d expressions.  Note that unfortunately it isn't
    the union, it's the sum, so duplicate elements will appear.
    """

    def __init__(self, items, prefix=None):
        for entry in items:
            assert entry is not None
        self.prefix = prefix
        self.items = items

    def __str__(self):
        # The shared prefix is repeated in front of every alternative.
        lead = self.prefix or ''
        return ' | '.join("%s%s" % (lead, entry) for entry in self.items)

# Splits text around runs of single quotes; the capturing group keeps the
# quote runs themselves in the result.
split_at_single_quotes = re.compile("('+)").split

def xpath_literal(s):
    """Return ``s`` written as an XPath string literal.

    XPath 1.0 literals have no escape mechanism, so text containing both
    quote characters is assembled with ``concat()`` from pieces that each
    use whichever quote style they do not contain.
    """
    if isinstance(s, Element):
        # This is probably a symbol that looks like an expression...
        s = s._format_element()
    else:
        s = _unicode(s)
    if "'" not in s:
        return "'%s'" % s
    if '"' not in s:
        return '"%s"' % s
    quoted = []
    for part in split_at_single_quotes(s):
        if not part:
            continue
        if "'" in part:
            quoted.append('"%s"' % part)
        else:
            quoted.append("'%s'" % part)
    return "concat(%s)" % ','.join(quoted)
640 641 ############################## 642 ## Parsing functions 643
def parse(string):
    """Parse CSS selector text into a tree of selector objects.

    A SelectorSyntaxError raised while parsing is re-raised with its
    message extended by the tokens consumed so far and the next token.
    """
    stream = TokenStream(tokenize(string))
    stream.source = string
    try:
        return parse_selector_group(stream)
    except SelectorSyntaxError:
        import sys
        err = sys.exc_info()[1]
        detail = "%s at %s -> %r" % (
            err, stream.used, stream.peek())
        err.msg = detail
        if sys.version_info < (2,6):
            err.message = detail
        err.args = (detail,)
        raise
659
def parse_selector_group(stream):
    """Parse a comma-separated list of selectors.

    Returns the selector itself when there is only one, otherwise an
    ``Or`` wrapping all of them.
    """
    selectors = [parse_selector(stream)]
    while stream.peek() == ',':
        stream.next()
        selectors.append(parse_selector(stream))
    if len(selectors) != 1:
        return Or(selectors)
    return selectors[0]
672
def parse_selector(stream):
    """Parse simple selectors joined by combinators, up to ',' or the end.

    When no explicit '+', '>' or '~' token separates two simple selectors,
    the descendant combinator ' ' is used.
    """
    result = parse_simple_selector(stream)
    while True:
        upcoming = stream.peek()
        if upcoming == ',' or upcoming is None:
            return result
        if upcoming in ('+', '>', '~'):
            # A combinator
            combinator = stream.next()
        else:
            combinator = ' '
        used_before = len(stream.used)
        next_selector = parse_simple_selector(stream)
        # Nothing consumed means the parser is stuck on a token it
        # cannot handle.
        if len(stream.used) == used_before:
            raise SelectorSyntaxError(
                "Expected selector, got '%s'" % stream.peek())
        result = CombinedSelector(result, combinator, next_selector)
691
def parse_simple_selector(stream):
    """Parse one simple selector.

    Reads an optional ``namespace|element`` (or ``*``) followed by any
    number of ``#id``, ``.class``, ``[attrib]`` and ``:pseudo`` /
    ``:function(arg)`` qualifiers, returning the nested selector objects.
    A single ' ' token after the qualifiers is consumed.

    Raises SelectorSyntaxError when an expected token is missing.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        element = namespace = '*'
    else:
        token = stream.next()
        if token != '*' and not isinstance(token, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got '%s'" % token)
        if stream.peek() == '|':
            namespace = token
            stream.next()
            element = stream.next()
            # Validate the element that follows '|', not the namespace
            # token that was already checked above.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % element)
        else:
            namespace = '*'
            element = token
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # You can't have two hashes
                # (FIXME: is there some more general rule I'm missing?)
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            closing = stream.next()
            if not closing == ']':
                raise SelectorSyntaxError(
                    "] expected, got '%s'" % closing)
            continue
        elif peek == ':' or peek == '::':
            pseudo_type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # FIXME: parse_simple_selector, or selector, or...?
                    selector = parse_simple_selector(stream)
                closing = stream.next()
                if not closing == ')':
                    raise SelectorSyntaxError(
                        "Expected ')', got '%s' and '%s'"
                        % (closing, selector))
                result = Function(result, pseudo_type, ident, selector)
            else:
                result = Pseudo(result, pseudo_type, ident)
            continue
        else:
            if peek == ' ':
                stream.next()
            break
        # FIXME: not sure what "negation" is
    return result
767
def is_int(v):
    """Return True when ``int(v)`` succeeds, False when it raises ValueError."""
    try:
        int(v)
        return True
    except ValueError:
        return False
775
def parse_attrib(selector, stream):
    """Parse the inside of ``[...]`` (without the closing bracket).

    Returns an ``Attrib`` wrapping ``selector``.  A bare attribute name
    yields the 'exists' operator with value None.
    """
    namespace = '*'
    attrib = stream.next()
    if stream.peek() == '|':
        # What was read so far is the namespace; the name follows the '|'.
        namespace = attrib
        stream.next()
        attrib = stream.next()
    if stream.peek() == ']':
        return Attrib(selector, namespace, attrib, 'exists', None)
    op = stream.next()
    if op not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got '%s'" % op)
    value = stream.next()
    if not isinstance(value, (Symbol, String)):
        raise SelectorSyntaxError(
            "Expected string or symbol, got '%s'" % value)
    return Attrib(selector, namespace, attrib, op, value)
795
def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        # Happens when there's nothing, which the CSS parser thinks of as *
        return (0, 0)
    if isinstance(s, int):
        # Happens when you just get a number
        return (0, s)
    if s == 'odd':
        return (2, 1)
    if s == 'even':
        return (2, 0)
    if s == 'n':
        return (1, 0)
    if 'n' not in s:
        # Just a b
        return (0, int(s))
    a_text, b_text = s.split('n', 1)
    # A missing coefficient means 1; a lone sign means +1 / -1.
    if not a_text:
        a = 1
    elif a_text in ('-', '+'):
        a = int(a_text + '1')
    else:
        a = int(a_text)
    # A missing offset means 0.
    if not b_text:
        b = 0
    elif b_text in ('-', '+'):
        b = int(b_text + '1')
    else:
        b = int(b_text)
    return (a, b)


############################################################
## Tokenizing
############################################################

# A run of whitespace.
_whitespace_re = re.compile(r'\s+', re.UNICODE)

# A /* ... */ comment; DOTALL lets it span lines.
_comment_re = re.compile(r'/\*.*?\*/', re.DOTALL)

# An ``an+b`` style count such as ``2n+1``, ``-n`` or ``n-3``.
_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')

def tokenize(s):
    """Yield Symbol, String and Token objects for the selector text ``s``.

    Comments are removed first.  Whitespace is skipped, except that a
    ' ' Token is emitted before '.' or '#' when whitespace starting at a
    non-zero offset precedes it.
    """
    s = _comment_re.sub('', s)
    end = len(s)
    pos = 0
    while True:
        blank = _whitespace_re.match(s, pos=pos)
        if blank:
            preceding_whitespace_pos = pos
            pos = blank.end()
        else:
            preceding_whitespace_pos = 0
        if pos >= end:
            return
        count = _count_re.match(s, pos=pos)
        # A lone 'n' is left to the symbol tokenizer below.
        if count and count.group() != 'n':
            yield Symbol(s[pos:count.end()], pos)
            pos = count.end()
            continue
        char = s[pos]
        pair = s[pos:pos+2]
        if pair in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
            yield Token(pair, pos)
            pos += 2
            continue
        if char in '>+~,.*=[]()|:#':
            if char in '.#' and preceding_whitespace_pos > 0:
                yield Token(' ', preceding_whitespace_pos)
            yield Token(char, pos)
            pos += 1
            continue
        if char == '"' or char == "'":
            # Quoted string
            text, after = tokenize_escaped_string(s, pos)
            yield String(text, pos)
            pos = after
            continue
        text, after = tokenize_symbol(s, pos)
        yield Symbol(text, pos)
        pos = after

# Splits a string literal around backslash escapes, keeping the escapes
# (capturing group).  An escape is a backslash followed by either 1-6 hex
# digits plus one optional whitespace character (CRLF counts as one), or
# by any single non-hex character.
# ``\\s`` is spelled with a doubled backslash because this is not a raw
# string: the ``\r\n`` next to it must stay real CR/LF characters.
split_at_string_escapes = re.compile(
    r'(\\(?:%s))' % '|'.join([
        '[A-Fa-f0-9]{1,6}(?:\r\n|\\s)?',
        '[^A-Fa-f0-9]',
    ])).split

def unescape_string_literal(literal):
    """Return ``literal`` with its backslash escapes resolved.

    Hex escapes become the character with that code point; any other
    escaped character stands for itself.  A backslash that does not start
    a recognised escape raises SelectorSyntaxError.
    """
    pieces = []
    for chunk in split_at_string_escapes(literal):
        if not chunk:
            continue
        if '\\' in chunk:
            if not (chunk[0] == '\\' and len(chunk) > 1):
                raise SelectorSyntaxError(
                    "Invalid escape sequence %r in string %r"
                    % (chunk.split('\\')[1], literal))
            chunk = chunk[1:]
            if chunk[0] in '0123456789ABCDEFabcdef':
                # int() correctly ignores the potentially trailing whitespace
                chunk = _unichr(int(chunk, 16))
        pieces.append(chunk)
    return ''.join(pieces)
905
def tokenize_escaped_string(s, pos):
    """Read the quoted string that starts at ``s[pos]``.

    Returns ``(text, next_pos)``: the unescaped text and the index just
    past the closing quote.  Raises SelectorSyntaxError when the closing
    quote is missing.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    start = pos + 1
    search_from = start
    while True:
        end = s.find(quote, search_from)
        if end == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        text = s[start:end]
        if not text.endswith('\\'):
            break
        # next quote character is escaped
        search_from = end + 1
    if '\\' in text:
        text = unescape_string_literal(text)
    return text, end + 1

# Any character that cannot be part of a symbol: everything except word
# characters, the backslash and the hyphen.
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)

def tokenize_symbol(s, pos):
    """Read the symbol that starts at ``s[pos]``.

    Returns ``(symbol, next_pos)``.  A symbol that runs to the end of
    ``s`` is returned as-is; otherwise its backslash escapes are decoded.

    Raises SelectorSyntaxError when ``s[pos]`` cannot start a symbol or
    when the escapes cannot be decoded.
    """
    start = pos
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # Goes to end of s
        return s[start:], len(s)
    if match.start() == pos:
        # Raised explicitly rather than asserted: with assertions disabled
        # an empty symbol would be returned without advancing ``pos``.
        raise SelectorSyntaxError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[start:match.start()]
    pos = match.start()
    try:
        result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        import sys
        e = sys.exc_info()[1]
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, pos
951
class TokenStream(object):
    """Token iterator with one token of lookahead.

    ``used`` lists every token handed out by ``next()``; ``None`` signals
    the end of the stream from both ``next()`` and ``peek()``.
    """

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked = None
        self._peeking = False
        try:
            self.next_token = self.tokens.next
        except AttributeError:
            # Python 3
            self.next_token = self.tokens.__next__

    def next(self):
        """Consume and return the next token, or None when exhausted."""
        if self._peeking:
            # Hand out the token that peek() already fetched.
            self._peeking = False
            token = self.peeked
        else:
            try:
                token = self.next_token()
            except StopIteration:
                return None
        self.used.append(token)
        return token

    def __iter__(self):
        return iter(self.next, None)

    def peek(self):
        """Return the next token without consuming it, or None when exhausted."""
        if self._peeking:
            return self.peeked
        try:
            self.peeked = self.next_token()
        except StopIteration:
            return None
        self._peeking = True
        return self.peeked
990