lxml.cssselect

1 """CSS Selectors based on XPath. 2 3 This module supports selecting XML/HTML tags based on CSS selectors. 4 See the `CSSSelector` class for details. 5 """ 6 7 import re 8 from lxml import etree 9 10 __all__ = ['SelectorSyntaxError', 'ExpressionError', 11 'CSSSelector'] 12 13 try: 14 _basestring = basestring 15 except NameError: 16 _basestring = str 17

class SelectorSyntaxError(SyntaxError):
    """Raised when a CSS selector string cannot be tokenized or parsed."""

20

class ExpressionError(RuntimeError):
    """Raised when a parsed selector cannot be translated into XPath."""

23

class CSSSelector(etree.XPath):
    """A CSS selector.

    The CSS expression is translated to XPath once, at construction time;
    the original expression stays available as the ``css`` attribute.

    Usage::

        >>> from lxml import etree, cssselect
        >>> select = cssselect.CSSSelector("a tag > child")

        >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
        >>> [ el.tag for el in select(root) ]
        ['child']
    """

    def __init__(self, css):
        # Translate first so a bad selector fails before the XPath base
        # class is initialised.
        translated = css_to_xpath(css)
        etree.XPath.__init__(self, translated)
        self.css = css

    def __repr__(self):
        object_id = hex(abs(id(self)))[2:]
        return '<%s %s for %r>' % (
            self.__class__.__name__, object_id, self.css)

##############################
## Token objects:

try:
    _unicode = unicode
    _unichr = unichr
except NameError:
    # Python 3: the text type is ``str`` and ``chr`` builds its characters.
    _unicode = str
    _unichr = chr


class _UniToken(_unicode):
    """Text token that also records its position in the selector string."""

    def __new__(cls, contents, pos):
        token = _unicode.__new__(cls, contents)
        token.pos = pos
        return token

    def __repr__(self):
        text = _unicode.__repr__(self)
        return '%s(%s, %r)' % (self.__class__.__name__, text, self.pos)


class Symbol(_UniToken):
    """Token type the tokenizer yields for names and ``an+b`` counts."""


class String(_UniToken):
    """Token type the tokenizer yields for quoted strings."""


class Token(_UniToken):
    """Token type the tokenizer yields for punctuation and operators."""

78 79 ############################################################ 80 ## Parsing 81 ############################################################ 82 83 ############################## 84 ## Syntax objects: 85

class Class(object):
    """
    Represents selector.class_name
    """

    def __init__(self, selector, class_name):
        self.selector, self.class_name = selector, class_name

    def __repr__(self):
        description = (self.__class__.__name__, self.selector, self.class_name)
        return '%s[%r.%s]' % description

    def xpath(self):
        """Return the selector's XPath restricted to the given class name."""
        translated = self.selector.xpath()
        # Pad both the attribute and the name with spaces so only whole
        # class names match.
        padded_name = xpath_literal(' ' + self.class_name + ' ')
        translated.add_condition(
            "contains(concat(' ', normalize-space(@class), ' '), %s)"
            % padded_name)
        return translated

106

class Function(object):
    """
    Represents selector:name(expr)

    ``xpath()`` dispatches on ``name`` to the ``_xpath_<name>`` method
    (dashes replaced by underscores), which refines the XPath expression of
    ``selector`` using the argument ``expr``.
    """

    # Names rejected up front with a "not supported" error instead of
    # falling through to the "unknown" error.
    unsupported = [
        'target', 'lang', 'enabled', 'disabled',]

    def __init__(self, selector, type, name, expr):
        self.selector = selector
        self.type = type
        self.name = name
        self.expr = expr

    def __repr__(self):
        return '%s[%r%s%s(%r)]' % (
            self.__class__.__name__,
            self.selector,
            self.type, self.name, self.expr)

    def xpath(self):
        """Translate to an XPath expression object.

        Raises ExpressionError when the pseudo-class is listed in
        ``unsupported`` or has no ``_xpath_<name>`` handler.
        """
        sel_path = self.selector.xpath()
        if self.name in self.unsupported:
            raise ExpressionError(
                "The psuedo-class %r is not supported" % self.name)
        method = getattr(self, '_xpath_' + self.name.replace('-', '_'), None)
        if method is None:
            raise ExpressionError(
                "The psuedo-class %r is unknown" % self.name)
        return method(sel_path, self.expr)

    def _xpath_nth_child(self, xpath, expr, last=False,
                         add_name_test=True):
        """Add the positional test for an ``an+b`` series to ``xpath``.

        ``last`` counts from the end (the ``nth-last-*`` variants);
        ``add_name_test`` moves the element name into a condition first,
        so positions are counted among all siblings.
        """
        a, b = parse_series(expr)
        if not a and not b and not last:
            # a=0 means nothing is returned...
            xpath.add_condition('false() and position() = 0')
            return xpath
        if add_name_test:
            xpath.add_name_test()
        xpath.add_star_prefix()
        if a == 0:
            # A plain index: a single position.
            if last:
                b = 'last() - %s' % b
            xpath.add_condition('position() = %s' % b)
            return xpath
        if last:
            # FIXME: I'm not sure if this is right
            a = -a
            b = -b
        if b > 0:
            b_neg = str(-b)
        else:
            b_neg = '+%s' % (-b)
        if a != 1:
            tests = ['(position() %s) mod %s = 0' % (b_neg, a)]
        else:
            tests = []
        if b >= 0:
            tests.append('position() >= %s' % b)
        elif b < 0 and last:
            tests.append('position() < (last() %s)' % b)
        condition = ' and '.join(tests)
        if condition:
            xpath.add_condition(condition)
        return xpath
        # FIXME: handle an+b, odd, even
        # an+b means every-a, plus b, e.g., 2n+1 means odd
        # 0n+b means b
        # n+0 means a=1, i.e., all elements
        # an means every a elements, i.e., 2n means even
        # -n means -1n
        # -1n+6 means elements 6 and previous

    def _xpath_nth_last_child(self, xpath, expr):
        return self._xpath_nth_child(xpath, expr, last=True)

    def _xpath_nth_of_type(self, xpath, expr):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, add_name_test=False)

    def _xpath_nth_last_of_type(self, xpath, expr):
        # Same restriction as nth-of-type: without an element name the
        # generated test would count all siblings, not siblings of one type.
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-last-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)

    def _xpath_contains(self, xpath, expr):
        # text content, minus tags, must contain expr
        if isinstance(expr, Element):
            expr = expr._format_element()
        xpath.add_condition('contains(css:lower-case(string(.)), %s)'
                            % xpath_literal(expr.lower()))
        # FIXME: Currently case insensitive matching doesn't seem to be happening
        return xpath

    def _xpath_not(self, xpath, expr):
        # everything for which not expr applies
        negated = expr.xpath()
        # Fold the element name into the condition; otherwise ``:not(div)``
        # would drop the name and produce ``not(None)``.
        negated.add_name_test()
        if negated.condition:
            xpath.add_condition('not(%s)' % negated.condition)
        else:
            # The negated selector matches every element, so nothing is left.
            xpath.add_condition('false()')
        # FIXME: should I do something about element_path?
        return xpath

210

211 -def _make_lower_case(context, s):

212 return s.lower()

# Register _make_lower_case in the '/css/' function namespace under the
# 'css' prefix, so XPath expressions can call it as ``css:lower-case(...)``.
ns = etree.FunctionNamespace('/css/')
ns.prefix = 'css'
ns['lower-case'] = _make_lower_case

class Pseudo(object):
    """
    Represents selector:ident

    ``xpath()`` looks up the ``_xpath_<ident>`` method (dashes replaced by
    underscores) and lets it refine the XPath expression of ``element``.
    """

    unsupported = ['indeterminate', 'first-line', 'first-letter',
                   'selection', 'before', 'after', 'link', 'visited',
                   'active', 'focus', 'hover']

    def __init__(self, element, type, ident):
        self.element = element
        assert type in (':', '::')
        self.type = type
        self.ident = ident

    def __repr__(self):
        description = (self.__class__.__name__, self.element,
                       self.type, self.ident)
        return '%s[%r%s%s]' % description

    def xpath(self):
        """Translate to XPath; raises ExpressionError for unusable idents."""
        el_xpath = self.element.xpath()
        if self.ident in self.unsupported:
            raise ExpressionError(
                "The psuedo-class %r is unsupported" % self.ident)
        handler = getattr(self, '_xpath_' + self.ident.replace('-', '_'), None)
        if handler is None:
            raise ExpressionError(
                "The psuedo-class %r is unknown" % self.ident)
        return handler(el_xpath)

    def _xpath_checked(self, xpath):
        # FIXME: is this really all the elements?
        xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
        return xpath

    def _xpath_root(self, xpath):
        # if this element is the root element
        raise NotImplementedError

    def _xpath_first_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_first_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:first-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:last-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_only_child(self, xpath):
        xpath.add_name_test()
        xpath.add_star_prefix()
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_only_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:only-of-type is not implemented")
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_empty(self, xpath):
        # No child elements and no non-whitespace text.
        xpath.add_condition("not(*) and not(normalize-space())")
        return xpath

305

class Attrib(object):
    """
    Represents selector[namespace|attrib operator value]
    """

    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def __repr__(self):
        name = self.__class__.__name__
        if self.operator == 'exists':
            return '%s[%r[%s]]' % (name, self.selector, self._format_attrib())
        return '%s[%r[%s %s %r]]' % (
            name, self.selector, self._format_attrib(),
            self.operator, self.value)

    def _format_attrib(self):
        """Return the attribute name in CSS notation."""
        if self.namespace != '*':
            return '%s|%s' % (self.namespace, self.attrib)
        return self.attrib

    def _xpath_attrib(self):
        """Return the attribute reference in XPath notation."""
        # FIXME: if attrib is *?
        if self.namespace != '*':
            return '@%s:%s' % (self.namespace, self.attrib)
        return '@' + self.attrib

    def xpath(self):
        """Return the selector's XPath with the attribute test added."""
        path = self.selector.xpath()
        attrib = self._xpath_attrib()
        value = self.value
        op = self.operator
        if op == 'exists':
            assert not value
            condition = attrib
        elif op == '=':
            condition = '%s = %s' % (attrib, xpath_literal(value))
        elif op == '!=':
            # FIXME: this seems like a weird hack...
            if value:
                condition = 'not(%s) or %s != %s' % (
                    attrib, attrib, xpath_literal(value))
            else:
                condition = '%s != %s' % (attrib, xpath_literal(value))
        elif op == '~=':
            condition = (
                "contains(concat(' ', normalize-space(%s), ' '), %s)"
                % (attrib, xpath_literal(' '+value+' ')))
        elif op == '|=':
            # Weird, but true...
            condition = '%s = %s or starts-with(%s, %s)' % (
                attrib, xpath_literal(value),
                attrib, xpath_literal(value + '-'))
        elif op == '^=':
            condition = 'starts-with(%s, %s)' % (attrib, xpath_literal(value))
        elif op == '$=':
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            condition = 'substring(%s, string-length(%s)-%s) = %s' % (
                attrib, attrib, len(value)-1, xpath_literal(value))
        elif op == '*=':
            # FIXME: case sensitive?
            condition = 'contains(%s, %s)' % (attrib, xpath_literal(value))
        else:
            assert 0, ("Unknown operator: %r" % self.operator)
            # Only reached when assertions are disabled: path is unchanged.
            return path
        path.add_condition(condition)
        return path

385

class Element(object):
    """
    Represents namespace|element
    """

    def __init__(self, namespace, element):
        self.namespace, self.element = namespace, element

    def __repr__(self):
        return '%s[%s]' % (self.__class__.__name__, self._format_element())

    def _format_element(self):
        """Return the element name in CSS notation."""
        if self.namespace != '*':
            return '%s|%s' % (self.namespace, self.element)
        return self.element

    def xpath(self):
        """Return a fresh XPathExpr that tests for this element name."""
        if self.namespace != '*':
            # FIXME: Should we lowercase here?
            name = '%s:%s' % (self.namespace, self.element)
        else:
            name = self.element.lower()
        return XPathExpr(element=name)

413

class Hash(object):
    """
    Represents selector#id
    """

    def __init__(self, selector, id):
        self.selector, self.id = selector, id

    def __repr__(self):
        description = (self.__class__.__name__, self.selector, self.id)
        return '%s[%r#%s]' % description

    def xpath(self):
        """Return the selector's XPath restricted to the given ``@id``."""
        translated = self.selector.xpath()
        translated.add_condition('@id = %s' % xpath_literal(self.id))
        return translated

432

class Or(object):
    """Represents a comma-separated group of selectors."""

    def __init__(self, items):
        self.items = items

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.items)

    def xpath(self):
        """Translate every alternative and combine them in an XPathExprOr."""
        translated = []
        for alternative in self.items:
            translated.append(alternative.xpath())
        return XPathExprOr(translated)

445

class CombinedSelector(object):
    """Represents two selectors joined by a combinator (' ', '>', '+', '~')."""

    # Combinator -> suffix of the ``_xpath_*`` method that handles it.
    _method_mapping = {
        ' ': 'descendant',
        '>': 'child',
        '+': 'direct_adjacent',
        '~': 'indirect_adjacent',
        }

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self):
        shown = self.combinator
        if shown == ' ':
            shown = '<followed>'
        return '%s[%r %s %r]' % (
            self.__class__.__name__, self.selector, shown, self.subselector)

    def xpath(self):
        """Translate to XPath; raises ExpressionError for unknown combinators."""
        suffix = self._method_mapping.get(self.combinator)
        if suffix is None:
            raise ExpressionError(
                "Unknown combinator: %r" % self.combinator)
        handler = getattr(self, '_xpath_' + suffix)
        left = self.selector.xpath()
        return handler(left, self.subselector)

    def _xpath_descendant(self, xpath, sub):
        # when sub is a descendant in any way of xpath
        xpath.join('/descendant::', sub.xpath())
        return xpath

    def _xpath_child(self, xpath, sub):
        # when sub is an immediate child of xpath
        xpath.join('/', sub.xpath())
        return xpath

    def _xpath_direct_adjacent(self, xpath, sub):
        # when sub immediately follows xpath
        xpath.join('/following-sibling::', sub.xpath())
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_indirect_adjacent(self, xpath, sub):
        # when sub comes somewhere after xpath as a sibling
        xpath.join('/following-sibling::', sub.xpath())
        return xpath

##############################
## XPathExpr objects:

# Patterns for the simplest selectors, which css_to_xpath translates
# directly without running the parser.
# A bare name, optionally followed by whitespace:
_el_re = re.compile(r'^\w+\s*$', re.UNICODE)
# An optional name followed by '#' and an id:
_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE)
# An optional name followed by '.' and a class name:
_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE)

def css_to_xpath(css_expr, prefix='descendant-or-self::'):
    """Translate a CSS selector into an XPath expression string.

    ``css_expr`` is either a selector string or an already parsed selector
    object (anything with an ``xpath()`` method).  ``prefix`` is put in
    front of the resulting path; ``None`` or an empty string means no
    prefix.
    """
    if isinstance(css_expr, _basestring):
        # The parsed path below skips a false prefix; do the same here so
        # that prefix=None does not end up as the text 'None' in the result.
        fast_prefix = prefix or ''
        match = _el_re.search(css_expr)
        if match is not None:
            return '%s%s' % (fast_prefix, match.group(0).strip())
        match = _id_re.search(css_expr)
        if match is not None:
            return "%s%s[@id = '%s']" % (
                fast_prefix, match.group(1) or '*', match.group(2))
        match = _class_re.search(css_expr)
        if match is not None:
            return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
                fast_prefix, match.group(1) or '*', match.group(2))
        css_expr = parse(css_expr)
    expr = css_expr.xpath()
    assert expr is not None, (
        "Got None for xpath expression from %s" % repr(css_expr))
    if prefix:
        expr.add_prefix(prefix)
    return _unicode(expr)

530

class XPathExpr(object):
    """An XPath expression under construction.

    Rendered as prefix + path + element, followed by ``[condition]`` when a
    condition is set.
    """

    def __init__(self, prefix=None, path=None, element='*', condition=None,
                 star_prefix=False):
        self.prefix, self.path = prefix, path
        self.element, self.condition = element, condition
        self.star_prefix = star_prefix

    def __str__(self):
        pieces = []
        if self.prefix is not None:
            pieces.append(_unicode(self.prefix))
        if self.path is not None:
            pieces.append(_unicode(self.path))
        pieces.append(_unicode(self.element))
        if self.condition:
            pieces.append('[%s]' % self.condition)
        return ''.join(pieces)

    def __repr__(self):
        return '%s[%s]' % (self.__class__.__name__, self)

    def add_condition(self, condition):
        """And ``condition`` onto the existing condition, if there is one."""
        if not self.condition:
            self.condition = condition
            return
        self.condition = '%s and (%s)' % (self.condition, condition)

    def add_path(self, part):
        """Move the current element into the path; ``part`` becomes the element."""
        if self.path is not None:
            self.path += self.element
        else:
            self.path = self.element
        self.element = part

    def add_prefix(self, prefix):
        """Put ``prefix`` in front of any existing prefix."""
        if not self.prefix:
            self.prefix = prefix
        else:
            self.prefix = prefix + self.prefix

    def add_name_test(self):
        """Turn the element name into a ``name() = ...`` condition."""
        if self.element == '*':
            # We weren't doing a test anyway
            return
        self.add_condition("name() = %s" % xpath_literal(self.element))
        self.element = '*'

    def add_star_prefix(self):
        """
        Appends '*/' to the path (or starts the path with it) and sets the
        star_prefix flag.  This is when you need to keep context's
        constrained to a single parent.
        """
        if not self.path:
            self.path = '*/'
        else:
            self.path += '*/'
        self.star_prefix = True

    def join(self, combiner, other):
        """Append ``other`` to this expression, separated by ``combiner``."""
        joined_prefix = _unicode(self)
        joined_prefix += combiner
        other_path = (other.prefix or '') + (other.path or '')
        # We don't need a star prefix if we are joining to this other
        # prefix; so we'll get rid of it
        if other.star_prefix and other_path == '*/':
            other_path = ''
        self.prefix = joined_prefix
        self.path = other_path
        self.element = other.element
        self.condition = other.condition

605

class XPathExprOr(XPathExpr):
    """
    Represents |'d expressions.  Note that unfortunately it isn't
    the union, it's the sum, so duplicate elements will appear.
    """

    def __init__(self, items, prefix=None):
        # Every alternative must have produced an expression.
        assert all(item is not None for item in items)
        self.items = items
        self.prefix = prefix

    def __str__(self):
        shared_prefix = self.prefix or ''
        rendered = ["%s%s" % (shared_prefix, item) for item in self.items]
        return ' | '.join(rendered)

# Splits a string into runs of single quotes and the text between them.
split_at_single_quotes = re.compile("('+)").split


def xpath_literal(s):
    """Return ``s`` as an XPath string literal.

    Single quotes are used when possible, else double quotes; a string
    containing both becomes a ``concat(...)`` of separately quoted parts.
    """
    if isinstance(s, Element):
        # This is probably a symbol that looks like an expression...
        text = s._format_element()
    else:
        text = _unicode(s)
    if "'" not in text:
        return "'%s'" % text
    if '"' not in text:
        return '"%s"' % text
    quoted_parts = []
    for part in split_at_single_quotes(text):
        if not part:
            continue
        if "'" in part:
            quoted_parts.append('"%s"' % part)
        else:
            quoted_parts.append("'%s'" % part)
    return "concat(%s)" % ','.join(quoted_parts)

640 641 ############################## 642 ## Parsing functions 643

def parse(string):
    """Parse a CSS selector string into a tree of selector objects.

    A SelectorSyntaxError raised while parsing is re-raised with the tokens
    consumed so far and the next token added to its message.
    """
    tokens = tokenize(string)
    stream = TokenStream(tokens)
    stream.source = string
    try:
        return parse_selector_group(stream)
    except SelectorSyntaxError:
        import sys
        error = sys.exc_info()[1]
        message = "%s at %s -> %r" % (error, stream.used, stream.peek())
        error.msg = message
        if sys.version_info < (2,6):
            error.message = message
        error.args = (message,)
        raise

659

def parse_selector_group(stream):
    """Parse comma-separated selectors; several are wrapped in an ``Or``."""
    selectors = [parse_selector(stream)]
    while stream.peek() == ',':
        stream.next()
        selectors.append(parse_selector(stream))
    if len(selectors) == 1:
        return selectors[0]
    return Or(selectors)

672

def parse_selector(stream):
    """Parse simple selectors and their combinators up to a ',' or the end."""
    selector = parse_simple_selector(stream)
    while True:
        upcoming = stream.peek()
        if upcoming is None or upcoming == ',':
            return selector
        if upcoming in ('+', '>', '~'):
            # A combinator
            combinator = stream.next()
        else:
            combinator = ' '
        used_before = len(stream.used)
        following = parse_simple_selector(stream)
        if len(stream.used) == used_before:
            # Nothing was consumed, so looping again would never terminate.
            raise SelectorSyntaxError(
                "Expected selector, got '%s'" % stream.peek())
        selector = CombinedSelector(selector, combinator, following)

691

def parse_simple_selector(stream):
    """Parse one simple selector from ``stream``.

    Reads an optional ``namespace|element`` name, then any sequence of
    ``#id``, ``.class``, ``[attrib]``, ``:pseudo`` and ``:function(arg)``
    parts, and returns the resulting selector object.

    Raises SelectorSyntaxError when a required token is missing or is of
    the wrong kind.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        element = namespace = '*'
    else:
        token = stream.next()
        if token != '*' and not isinstance(token, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got '%s'" % token)
        if stream.peek() == '|':
            namespace = token
            stream.next()
            element = stream.next()
            # Validate the element name itself (not the namespace token),
            # so a missing or invalid name after '|' is rejected here.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % element)
        else:
            namespace = '*'
            element = token
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # You can't have two hashes
                # (FIXME: is there some more general rule I'm missing?)
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            token = stream.next()
            if not token == ']':
                raise SelectorSyntaxError(
                    "] expected, got '%s'" % token)
            continue
        elif peek == ':' or peek == '::':
            pseudo_type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    argument = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    argument = int(stream.next())
                else:
                    # FIXME: parse_simple_selector, or selector, or...?
                    argument = parse_simple_selector(stream)
                token = stream.next()
                if not token == ')':
                    raise SelectorSyntaxError(
                        "Expected ')', got '%s' and '%s'"
                        % (token, argument))
                result = Function(result, pseudo_type, ident, argument)
            else:
                result = Pseudo(result, pseudo_type, ident)
            continue
        else:
            if peek == ' ':
                stream.next()
            break
        # FIXME: not sure what "negation" is
    return result

767

def is_int(v):
    """Return True when ``int(v)`` succeeds, False when it raises ValueError."""
    try:
        int(v)
        return True
    except ValueError:
        return False

775

def parse_attrib(selector, stream):
    """Parse the inside of ``[...]`` and wrap ``selector`` in an Attrib.

    The closing ']' is left in the stream.  Raises SelectorSyntaxError for
    an unknown operator or a value that is neither a symbol nor a string.
    """
    attrib = stream.next()
    namespace = '*'
    if stream.peek() == '|':
        namespace = attrib
        stream.next()
        attrib = stream.next()
    if stream.peek() == ']':
        return Attrib(selector, namespace, attrib, 'exists', None)
    op = stream.next()
    if op not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got '%s'" % op)
    value = stream.next()
    if not isinstance(value, (Symbol, String)):
        raise SelectorSyntaxError(
            "Expected string or symbol, got '%s'" % value)
    return Attrib(selector, namespace, attrib, op, value)

795

def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)
    """
    def coefficient(text, default):
        # A missing part takes the default; a bare sign means +1 / -1.
        if not text:
            return default
        if text == '-' or text == '+':
            return int(text + '1')
        return int(text)

    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        # Happens when there's nothing, which the CSS parser thinks of as *
        return (0, 0)
    if isinstance(s, int):
        # Happens when you just get a number
        return (0, s)
    for keyword, series in (('odd', (2, 1)), ('even', (2, 0)), ('n', (1, 0))):
        if s == keyword:
            return series
    if 'n' not in s:
        # Just a b
        return (0, int(s))
    a_text, b_text = s.split('n', 1)
    a = coefficient(a_text, 1)
    b = coefficient(b_text, 0)
    return (a, b)

############################################################
## Tokenizing
############################################################

_whitespace_re = re.compile(r'\s+', re.UNICODE)

_comment_re = re.compile(r'/\*.*?\*/', re.DOTALL)

_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')


def tokenize(s):
    """Yield the tokens of selector string ``s``, comments removed.

    Whitespace is skipped, except that a ' ' Token is yielded for
    whitespace (not at the very start) directly before '.' or '#'.
    """
    s = _comment_re.sub('', s)
    length = len(s)
    pos = 0
    while True:
        blank = _whitespace_re.match(s, pos=pos)
        if blank:
            preceding_whitespace_pos = pos
            pos = blank.end()
        else:
            preceding_whitespace_pos = 0
        if pos >= length:
            return
        count = _count_re.match(s, pos=pos)
        if count and count.group() != 'n':
            # An an+b style count; a lone 'n' is left to the symbol branch.
            end = count.end()
            yield Symbol(s[pos:end], pos)
            pos = end
            continue
        char = s[pos]
        pair = s[pos:pos+2]
        if pair in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
            yield Token(pair, pos)
            pos += 2
        elif char in '>+~,.*=[]()|:#':
            if char in '.#' and preceding_whitespace_pos > 0:
                yield Token(' ', preceding_whitespace_pos)
            yield Token(char, pos)
            pos += 1
        elif char == '"' or char == "'":
            # Quoted string
            began = pos
            text, pos = tokenize_escaped_string(s, pos)
            yield String(text, began)
        else:
            began = pos
            text, pos = tokenize_symbol(s, pos)
            yield Symbol(text, began)

# Splits a string at backslash escapes, keeping the escapes in the result:
# either a backslash followed by 1-6 hex digits and one optional whitespace
# (CRLF counts as one), or a backslash followed by any non-hex character.
# Raw strings keep '\s' from being an invalid Python string escape.
split_at_string_escapes = re.compile(
    r'(\\(?:%s))' % '|'.join([r'[A-Fa-f0-9]{1,6}(?:\r\n|\s)?',
                              r'[^A-Fa-f0-9]'])).split


def unescape_string_literal(literal):
    """Return ``literal`` with its backslash escapes resolved.

    A hex escape becomes the character with that code point; any other
    escaped character stands for itself.  Raises SelectorSyntaxError for a
    backslash that does not start a valid escape.
    """
    substrings = []
    for substring in split_at_string_escapes(literal):
        if not substring:
            continue
        elif '\\' in substring:
            if substring[0] == '\\' and len(substring) > 1:
                substring = substring[1:]
                if substring[0] in '0123456789ABCDEFabcdef':
                    # int() correctly ignores the potentially trailing whitespace
                    substring = _unichr(int(substring, 16))
            else:
                raise SelectorSyntaxError(
                    "Invalid escape sequence %r in string %r"
                    % (substring.split('\\')[1], literal))
        substrings.append(substring)
    return ''.join(substrings)

905

def tokenize_escaped_string(s, pos):
    """Read the quoted string that starts at ``s[pos]``.

    Returns ``(text, end)``: the string contents with escapes resolved, and
    the index just past the closing quote.  Raises SelectorSyntaxError when
    the closing quote is missing.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    start = pos + 1
    search_from = start
    while 1:
        end = s.find(quote, search_from)
        if end == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        result = s[start:end]
        # The quote is escaped only when preceded by an odd number of
        # backslashes; an even run is made of escaped backslashes.
        trailing_backslashes = len(result) - len(result.rstrip('\\'))
        if trailing_backslashes % 2:
            search_from = end + 1
            continue
        if '\\' in result:
            result = unescape_string_literal(result)
        return result, end + 1

# Any character that cannot be part of a symbol: not a word character,
# not a backslash and not a hyphen.
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)


def tokenize_symbol(s, pos):
    """Read the symbol that starts at ``s[pos]``.

    Returns ``(symbol, end)`` where ``end`` is the index of the first
    character after the symbol.  A symbol that runs to the end of ``s`` is
    returned as-is; otherwise its backslash escapes are decoded.

    Raises SelectorSyntaxError when no symbol starts at ``pos`` or when the
    escapes cannot be decoded.
    """
    start = pos
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # Goes to end of s
        return s[start:], len(s)
    if match.start() == pos:
        # Raise instead of assert: with assertions disabled this would
        # return an empty symbol without advancing.
        raise SelectorSyntaxError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[start:match.start()]
    pos = match.start()
    try:
        result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        import sys
        e = sys.exc_info()[1]
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, pos

951

class TokenStream(object):
    """Token iterator with one token of lookahead.

    ``used`` lists every token handed out by ``next()``.  Both ``next()``
    and ``peek()`` return None once the underlying tokens run out.
    """

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked = None
        self._peeking = False
        if hasattr(self.tokens, 'next'):
            self.next_token = self.tokens.next
        else:
            # Python 3
            self.next_token = self.tokens.__next__

    def next(self):
        """Consume and return the next token, or None at the end."""
        if self._peeking:
            # Hand out the token that peek() already fetched.
            self._peeking = False
            token = self.peeked
        else:
            try:
                token = self.next_token()
            except StopIteration:
                return None
        self.used.append(token)
        return token

    def __iter__(self):
        return iter(self.next, None)

    def peek(self):
        """Return the next token without consuming it, or None at the end."""
        if self._peeking:
            return self.peeked
        try:
            self.peeked = self.next_token()
        except StopIteration:
            return None
        self._peeking = True
        return self.peeked

990

Source Code for Module lxml.cssselect