# lxml.cssselect
"""CSS Selectors based on XPath.

This module supports selecting XML/HTML tags based on CSS selectors.
See the `CSSSelector` class for details.
"""

import re
from lxml import etree

__all__ = [
    'SelectorSyntaxError',
    'ExpressionError',
    'CSSSelector',
]

# ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
try:
    _basestring = basestring
except NameError:
    _basestring = str

class SelectorSyntaxError(SyntaxError):
    """Raised when a CSS selector string cannot be tokenized or parsed."""

20

class ExpressionError(RuntimeError):
    """Raised when a parsed selector cannot be translated to XPath."""

23

class CSSSelector(etree.XPath):
    """A CSS selector.

    Usage::

        >>> from lxml import etree, cssselect
        >>> select = cssselect.CSSSelector("a tag > child")

        >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
        >>> [ el.tag for el in select(root) ]
        ['child']

    To use CSS namespaces, you need to pass a prefix-to-namespace
    mapping as ``namespaces`` keyword argument::

        >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
        >>> select_ns = cssselect.CSSSelector('root > rdf|Description',
        ...                                   namespaces={'rdf': rdfns})

        >>> rdf = etree.XML((
        ...     '<root xmlns:rdf="%s">'
        ...       '<rdf:Description>blah</rdf:Description>'
        ...     '</root>') % rdfns)
        >>> [(el.tag, el.text) for el in select_ns(rdf)]
        [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]
    """

    def __init__(self, css, namespaces=None):
        # Translate the CSS text once, then hand the XPath to the base class.
        xpath_source = css_to_xpath(css)
        etree.XPath.__init__(self, xpath_source, namespaces=namespaces)
        # Keep the original selector text for __repr__.
        self.css = css

    def __repr__(self):
        identity = hex(abs(id(self)))[2:]
        return '<%s %s for %r>' % (
            self.__class__.__name__, identity, self.css)

60 61 ############################## 62 ## Token objects: 63 64 try: 65 _unicode = unicode 66 _unichr = unichr 67 except NameError: 68 # Python 3 69 _unicode = str 70 _unichr = chr 71

class _UniToken(_unicode):
    """Unicode string that also remembers the position it was created with."""

    def __new__(cls, contents, pos):
        token = _unicode.__new__(cls, contents)
        token.pos = pos
        return token

    def __repr__(self):
        text_repr = _unicode.__repr__(self)
        return '%s(%s, %r)' % (
            self.__class__.__name__, text_repr, self.pos)

83

class Symbol(_UniToken):
    """Token type the tokenizer yields for names and ``an+b`` expressions."""

86

class String(_UniToken):
    """Token type the tokenizer yields for quoted string literals."""

89

class Token(_UniToken):
    """Token type the tokenizer yields for operators, punctuation and ' '."""

92 93 ############################################################ 94 ## Parsing 95 ############################################################ 96 97 ############################## 98 ## Syntax objects: 99

class Class(object):
    """
    Represents selector.class_name
    """

    def __init__(self, selector, class_name):
        self.selector = selector
        self.class_name = class_name

    def __repr__(self):
        return '%s[%r.%s]' % (
            self.__class__.__name__, self.selector, self.class_name)

    def xpath(self):
        """Add a whitespace-delimited ``@class`` membership test."""
        result = self.selector.xpath()
        # Pad with spaces so only whole class names match.
        padded_name = xpath_literal(' ' + self.class_name + ' ')
        result.add_condition(
            "contains(concat(' ', normalize-space(@class), ' '), %s)"
            % padded_name)
        return result

120

class Function(object):
    """
    Represents selector:name(expr)
    """

    # Names that xpath() rejects explicitly with "not supported".
    unsupported = [
        'target', 'lang', 'enabled', 'disabled',]

    def __init__(self, selector, type, name, expr):
        self.selector = selector
        self.type = type
        self.name = name
        self.expr = expr

    def __repr__(self):
        return '%s[%r%s%s(%r)]' % (
            self.__class__.__name__, self.selector,
            self.type, self.name, self.expr)

    def xpath(self):
        """Dispatch to the ``_xpath_<name>`` method for this function.

        Raises ExpressionError for unsupported or unknown names.
        """
        sel_path = self.selector.xpath()
        if self.name in self.unsupported:
            raise ExpressionError(
                "The pseudo-class %r is not supported" % self.name)
        handler_name = '_xpath_' + self.name.replace('-', '_')
        if not hasattr(self, handler_name):
            raise ExpressionError(
                "The pseudo-class %r is unknown" % self.name)
        handler = getattr(self, handler_name)
        return handler(sel_path, self.expr)

    def _xpath_nth_child(self, xpath, expr, last=False,
                         add_name_test=True):
        """Add the positional condition for an ``an+b`` argument.

        ``last`` switches to counting from the end; ``add_name_test``
        controls whether the element name is moved into a condition.
        """
        a, b = parse_series(expr)
        if not a and not b and not last:
            # a=0 means nothing is returned...
            xpath.add_condition('false() and position() = 0')
            return xpath
        if add_name_test:
            xpath.add_name_test()
        xpath.add_star_prefix()
        if a == 0:
            # Plain index: a single position, optionally counted from the end.
            if last:
                b = 'last() - %s' % b
            xpath.add_condition('position() = %s' % b)
            return xpath
        if last:
            # FIXME: I'm not sure if this is right
            a = -a
            b = -b
        if b > 0:
            b_neg = str(-b)
        else:
            b_neg = '+%s' % (-b)
        conditions = []
        if a != 1:
            conditions.append('(position() %s) mod %s = 0' % (b_neg, a))
        if b >= 0:
            conditions.append('position() >= %s' % b)
        elif b < 0 and last:
            conditions.append('position() < (last() %s)' % b)
        combined = ' and '.join(conditions)
        if combined:
            xpath.add_condition(combined)
        return xpath
        # FIXME: handle an+b, odd, even
        # an+b means every-a, plus b, e.g., 2n+1 means odd
        # 0n+b means b
        # n+0 means a=1, i.e., all elements
        # an means every a elements, i.e., 2n means even
        # -n means -1n
        # -1n+6 means elements 6 and previous

    def _xpath_nth_last_child(self, xpath, expr):
        return self._xpath_nth_child(xpath, expr, last=True)

    def _xpath_nth_of_type(self, xpath, expr):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, add_name_test=False)

    def _xpath_nth_last_of_type(self, xpath, expr):
        return self._xpath_nth_child(
            xpath, expr, last=True, add_name_test=False)

    def _xpath_contains(self, xpath, expr):
        # text content, minus tags, must contain expr
        if isinstance(expr, Element):
            expr = expr._format_element()
        needle = xpath_literal(expr.lower())
        xpath.add_condition(
            'contains(css:lower-case(string(.)), %s)' % needle)
        # FIXME: Currently case insensitive matching doesn't seem to be happening
        return xpath

    def _xpath_not(self, xpath, expr):
        # everything for which not expr applies
        negated = expr.xpath()
        cond = negated.condition
        # FIXME: should I do something about element_path?
        xpath.add_condition('not(%s)' % cond)
        return xpath

224

225 -def _make_lower_case(context, s):

226 return s.lower()

# Expose _make_lower_case to XPath as ``css:lower-case`` (the name used by
# Function._xpath_contains).
ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
ns.prefix = 'css'
ns['lower-case'] = _make_lower_case

class Pseudo(object):
    """
    Represents selector:ident
    """

    # Identifiers that xpath() rejects explicitly with "is unsupported".
    unsupported = ['indeterminate', 'first-line', 'first-letter',
                   'selection', 'before', 'after', 'link', 'visited',
                   'active', 'focus', 'hover']

    def __init__(self, element, type, ident):
        self.element = element
        assert type in (':', '::')
        self.type = type
        self.ident = ident

    def __repr__(self):
        return '%s[%r%s%s]' % (
            self.__class__.__name__, self.element,
            self.type, self.ident)

    def xpath(self):
        """Dispatch to the ``_xpath_<ident>`` method for this pseudo-class.

        Raises ExpressionError for unsupported or unknown identifiers.
        """
        el_xpath = self.element.xpath()
        if self.ident in self.unsupported:
            raise ExpressionError(
                "The pseudo-class %r is unsupported" % self.ident)
        handler_name = '_xpath_' + self.ident.replace('-', '_')
        if not hasattr(self, handler_name):
            raise ExpressionError(
                "The pseudo-class %r is unknown" % self.ident)
        handler = getattr(self, handler_name)
        return handler(el_xpath)

    def _xpath_checked(self, xpath):
        # FIXME: is this really all the elements?
        xpath.add_condition(
            "(@selected or @checked) and "
            "(name(.) = 'input' or name(.) = 'option')")
        return xpath

    def _xpath_root(self, xpath):
        # if this element is the root element
        raise NotImplementedError

    def _xpath_first_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_first_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:first-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:last-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_only_child(self, xpath):
        xpath.add_name_test()
        xpath.add_star_prefix()
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_only_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:only-of-type is not implemented")
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_empty(self, xpath):
        xpath.add_condition("not(*) and not(normalize-space())")
        return xpath

319

class Attrib(object):
    """
    Represents selector[namespace|attrib operator value]
    """

    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def __repr__(self):
        cls_name = self.__class__.__name__
        if self.operator == 'exists':
            return '%s[%r[%s]]' % (
                cls_name, self.selector, self._format_attrib())
        return '%s[%r[%s %s %r]]' % (
            cls_name, self.selector, self._format_attrib(),
            self.operator, self.value)

    def _format_attrib(self):
        """Return the attribute in CSS notation (``ns|name`` or ``name``)."""
        if self.namespace == '*':
            return self.attrib
        return '%s|%s' % (self.namespace, self.attrib)

    def _xpath_attrib(self):
        """Return the attribute in XPath notation (``@ns:name`` or ``@name``)."""
        # FIXME: if attrib is *?
        if self.namespace == '*':
            return '@' + self.attrib
        return '@%s:%s' % (self.namespace, self.attrib)

    def xpath(self):
        """Add the XPath condition matching this attribute test."""
        path = self.selector.xpath()
        attrib = self._xpath_attrib()
        value = self.value
        op = self.operator
        if op == 'exists':
            assert not value
            path.add_condition(attrib)
        elif op == '=':
            path.add_condition(
                '%s = %s' % (attrib, xpath_literal(value)))
        elif op == '!=':
            # FIXME: this seems like a weird hack...
            if value:
                path.add_condition(
                    'not(%s) or %s != %s'
                    % (attrib, attrib, xpath_literal(value)))
            else:
                path.add_condition(
                    '%s != %s' % (attrib, xpath_literal(value)))
        elif op == '~=':
            # Whitespace-separated word match.
            path.add_condition(
                "contains(concat(' ', normalize-space(%s), ' '), %s)"
                % (attrib, xpath_literal(' ' + value + ' ')))
        elif op == '|=':
            # Weird, but true...
            path.add_condition(
                '%s = %s or starts-with(%s, %s)' % (
                    attrib, xpath_literal(value),
                    attrib, xpath_literal(value + '-')))
        elif op == '^=':
            path.add_condition(
                'starts-with(%s, %s)' % (attrib, xpath_literal(value)))
        elif op == '$=':
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            path.add_condition(
                'substring(%s, string-length(%s)-%s) = %s'
                % (attrib, attrib, len(value) - 1, xpath_literal(value)))
        elif op == '*=':
            # FIXME: case sensitive?
            path.add_condition(
                'contains(%s, %s)' % (attrib, xpath_literal(value)))
        else:
            assert 0, ("Unknown operator: %r" % self.operator)
        return path

399

class Element(object):
    """
    Represents namespace|element
    """

    def __init__(self, namespace, element):
        self.namespace = namespace
        self.element = element

    def __repr__(self):
        return '%s[%s]' % (
            self.__class__.__name__, self._format_element())

    def _format_element(self):
        """Return the element in CSS notation (``ns|name`` or ``name``)."""
        if self.namespace == '*':
            return self.element
        return '%s|%s' % (self.namespace, self.element)

    def xpath(self):
        """Return an XPathExpr whose element step names this element."""
        if self.namespace != '*':
            # FIXME: Should we lowercase here?
            el = '%s:%s' % (self.namespace, self.element)
        else:
            el = self.element.lower()
        return XPathExpr(element=el)

427

class Hash(object):
    """
    Represents selector#id
    """

    def __init__(self, selector, id):
        self.selector = selector
        self.id = id

    def __repr__(self):
        return '%s[%r#%s]' % (
            self.__class__.__name__, self.selector, self.id)

    def xpath(self):
        """Add an ``@id = ...`` condition to the wrapped selector's XPath."""
        result = self.selector.xpath()
        id_literal = xpath_literal(self.id)
        result.add_condition('@id = %s' % id_literal)
        return result

446

class Or(object):
    """Holds the alternatives of a comma-separated selector group."""

    def __init__(self, items):
        self.items = items

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.items)

    def xpath(self):
        """Translate every alternative and wrap them in an XPathExprOr."""
        translated = []
        for item in self.items:
            translated.append(item.xpath())
        return XPathExprOr(translated)

459

class CombinedSelector(object):
    """Two selectors joined by a combinator (' ', '>', '+' or '~')."""

    # Combinator character -> suffix of the ``_xpath_*`` method handling it.
    _method_mapping = {
        ' ': 'descendant',
        '>': 'child',
        '+': 'direct_adjacent',
        '~': 'indirect_adjacent',
        }

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self):
        # A bare space would be invisible, so show a readable marker instead.
        comb = self.combinator
        if comb == ' ':
            comb = '<followed>'
        return '%s[%r %s %r]' % (
            self.__class__.__name__, self.selector, comb, self.subselector)

    def xpath(self):
        """Translate via the method mapped to the combinator.

        Raises ExpressionError for a combinator not in ``_method_mapping``.
        """
        if self.combinator not in self._method_mapping:
            raise ExpressionError(
                "Unknown combinator: %r" % self.combinator)
        handler = getattr(
            self, '_xpath_' + self._method_mapping[self.combinator])
        path = self.selector.xpath()
        return handler(path, self.subselector)

    def _xpath_descendant(self, xpath, sub):
        # when sub is a descendant in any way of xpath
        xpath.join('/descendant-or-self::*/', sub.xpath())
        return xpath

    def _xpath_child(self, xpath, sub):
        # when sub is an immediate child of xpath
        xpath.join('/', sub.xpath())
        return xpath

    def _xpath_direct_adjacent(self, xpath, sub):
        # when sub immediately follows xpath
        xpath.join('/following-sibling::', sub.xpath())
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_indirect_adjacent(self, xpath, sub):
        # when sub comes somewhere after xpath as a sibling
        xpath.join('/following-sibling::', sub.xpath())
        return xpath

516 517 ############################## 518 ## XPathExpr objects: 519 520 _el_re = re.compile(r'^\w+\s*$', re.UNICODE) 521 _id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE) 522 _class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE) 523

def css_to_xpath(css_expr, prefix='descendant-or-self::'):
    """Translate a CSS selector into an XPath expression string.

    ``css_expr`` is either selector text or an already parsed selector
    object exposing ``xpath()``.  Text matching one of the simple shapes
    (tag, tag#id, tag.class) is translated directly without parsing.
    """
    if isinstance(css_expr, _basestring):
        simple = _el_re.search(css_expr)
        if simple is not None:
            return '%s%s' % (prefix, simple.group(0).strip())
        by_id = _id_re.search(css_expr)
        if by_id is not None:
            return "%s%s[@id = '%s']" % (
                prefix, by_id.group(1) or '*', by_id.group(2))
        by_class = _class_re.search(css_expr)
        if by_class is not None:
            return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
                prefix, by_class.group(1) or '*', by_class.group(2))
        # Anything more complex goes through the full parser.
        css_expr = parse(css_expr)
    expr = css_expr.xpath()
    assert expr is not None, (
        "Got None for xpath expression from %s" % repr(css_expr))
    if prefix:
        expr.add_prefix(prefix)
    return _unicode(expr)

544

class XPathExpr(object):
    """An XPath expression built from prefix, path, element and condition."""

    def __init__(self, prefix=None, path=None, element='*', condition=None,
                 star_prefix=False):
        self.prefix = prefix
        self.path = path
        self.element = element
        self.condition = condition
        self.star_prefix = star_prefix

    def __str__(self):
        # Rendered as: prefix + path + element + "[condition]".
        pieces = []
        if self.prefix is not None:
            pieces.append(_unicode(self.prefix))
        if self.path is not None:
            pieces.append(_unicode(self.path))
        pieces.append(_unicode(self.element))
        if self.condition:
            pieces.append('[%s]' % self.condition)
        return ''.join(pieces)

    def __repr__(self):
        return '%s[%s]' % (self.__class__.__name__, self)

    def add_condition(self, condition):
        """AND ``condition`` onto the existing condition, if there is one."""
        if not self.condition:
            self.condition = condition
        else:
            self.condition = '%s and (%s)' % (self.condition, condition)

    def add_path(self, part):
        """Move the current element into the path; ``part`` becomes the element."""
        if self.path is None:
            self.path = self.element
        else:
            self.path += self.element
        self.element = part

    def add_prefix(self, prefix):
        """Put ``prefix`` in front of any existing prefix."""
        if self.prefix:
            prefix = prefix + self.prefix
        self.prefix = prefix

    def add_name_test(self):
        """Replace a named element step by ``*`` plus a ``name() = ...`` test."""
        if self.element == '*':
            # We weren't doing a test anyway
            return
        self.add_condition("name() = %s" % xpath_literal(self.element))
        self.element = '*'

    def add_star_prefix(self):
        """
        Adds a /* prefix if there is no prefix.  This is when you need
        to keep context's constrained to a single parent.
        """
        self.path = (self.path or '') + '*/'
        self.star_prefix = True

    def join(self, combiner, other):
        """Make this expression ``<self><combiner><other>`` in place."""
        prefix = _unicode(self) + combiner
        path = (other.prefix or '') + (other.path or '')
        # We don't need a star prefix if we are joining to this other
        # prefix; so we'll get rid of it
        if other.star_prefix and path == '*/':
            path = ''
        self.prefix = prefix
        self.path = path
        self.element = other.element
        self.condition = other.condition

619

class XPathExprOr(XPathExpr):
    """
    Represents |'d expressions.  Note that unfortunately it isn't
    the union, it's the sum, so duplicate elements will appear.
    """

    def __init__(self, items, prefix=None):
        for candidate in items:
            assert candidate is not None
        self.items = items
        self.prefix = prefix

    def __str__(self):
        # The prefix is repeated in front of every alternative.
        lead = self.prefix or ''
        rendered = ["%s%s" % (lead, item) for item in self.items]
        return ' | '.join(rendered)

# Splits a string at runs of single quotes, keeping the runs in the result.
split_at_single_quotes = re.compile("('+)").split

def xpath_literal(s):
    """Return ``s`` quoted as an XPath string literal.

    Text containing both quote characters is assembled with ``concat()``
    from alternately quoted pieces.
    """
    if isinstance(s, Element):
        # This is probably a symbol that looks like an expression...
        s = s._format_element()
    else:
        s = _unicode(s)
    if "'" not in s:
        return "'%s'" % s
    if '"' not in s:
        return '"%s"' % s
    quoted = []
    for part in split_at_single_quotes(s):
        if not part:
            continue
        # Runs of single quotes go in double quotes, everything else in single.
        if "'" in part:
            quoted.append('"%s"' % part)
        else:
            quoted.append("'%s'" % part)
    return "concat(%s)" % ','.join(quoted)

654 655 ############################## 656 ## Parsing functions 657

def parse(string):
    """Parse selector text into a tree of selector objects.

    A SelectorSyntaxError raised while parsing is re-raised with the
    tokens consumed so far and the next token added to its message.
    """
    stream = TokenStream(tokenize(string))
    stream.source = string
    try:
        return parse_selector_group(stream)
    except SelectorSyntaxError:
        import sys
        # sys.exc_info() keeps this working on old Python versions.
        error = sys.exc_info()[1]
        message = "%s at %s -> %r" % (
            error, stream.used, stream.peek())
        error.msg = message
        if sys.version_info < (2,6):
            error.message = message
        error.args = (message,)
        raise

673

def parse_selector_group(stream):
    """Parse comma-separated selectors; return one selector or an ``Or``."""
    selectors = [parse_selector(stream)]
    while stream.peek() == ',':
        stream.next()
        selectors.append(parse_selector(stream))
    if len(selectors) == 1:
        return selectors[0]
    return Or(selectors)

686

def parse_selector(stream):
    """Parse simple selectors chained by combinators, up to ',' or the end."""
    result = parse_simple_selector(stream)
    while True:
        upcoming = stream.peek()
        if upcoming == ',' or upcoming is None:
            return result
        if upcoming in ('+', '>', '~'):
            # A combinator
            combinator = stream.next()
            # Ignore optional whitespace after a combinator
            while stream.peek() == ' ':
                stream.next()
        else:
            combinator = ' '
        # Guard against looping forever when nothing gets consumed.
        consumed = len(stream.used)
        next_selector = parse_simple_selector(stream)
        if consumed == len(stream.used):
            raise SelectorSyntaxError(
                "Expected selector, got '%s'" % stream.peek())
        result = CombinedSelector(result, combinator, next_selector)

708

def parse_simple_selector(stream):
    """Parse one simple selector from ``stream``.

    Reads an optional ``namespace|element`` (or ``*``) followed by any
    number of ``#id``, ``.class``, ``[attrib]``, ``:pseudo`` and
    ``:function(arg)`` parts, and returns the nested selector object.

    Raises SelectorSyntaxError when an expected token is missing.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        # No explicit element: behaves like the universal selector.
        element = namespace = '*'
    else:
        first = stream.next()
        if first != '*' and not isinstance(first, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got '%s'" % first)
        if stream.peek() == '|':
            namespace = first
            stream.next()
            element = stream.next()
            # Validate the element name itself; the original re-checked the
            # namespace token here, so a bad element name went unnoticed.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % element)
        else:
            namespace = '*'
            element = first
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # You can't have two hashes
                # (FIXME: is there some more general rule I'm missing?)
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            closing = stream.next()
            if not closing == ']':
                raise SelectorSyntaxError(
                    "] expected, got '%s'" % closing)
            continue
        elif peek == ':' or peek == '::':
            pseudo_type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got '%s'" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # FIXME: parse_simple_selector, or selector, or...?
                    selector = parse_simple_selector(stream)
                closing = stream.next()
                if not closing == ')':
                    raise SelectorSyntaxError(
                        "Expected ')', got '%s' and '%s'"
                        % (closing, selector))
                result = Function(result, pseudo_type, ident, selector)
            else:
                result = Pseudo(result, pseudo_type, ident)
            continue
        else:
            if peek == ' ':
                stream.next()
            break
        # FIXME: not sure what "negation" is
    return result

784

def is_int(v):
    """Return True if ``int(v)`` succeeds, False if it raises ValueError."""
    try:
        int(v)
    except ValueError:
        return False
    return True

792

def parse_attrib(selector, stream):
    """Parse the inside of ``[...]`` (after the '[') into an ``Attrib``.

    The closing ']' is left in the stream for the caller.  Raises
    SelectorSyntaxError for an unknown operator or a value that is
    neither a symbol nor a string.
    """
    namespace = '*'
    attrib = stream.next()
    if stream.peek() == '|':
        # What we read first was the namespace; the name follows the '|'.
        namespace = attrib
        stream.next()
        attrib = stream.next()
    if stream.peek() == ']':
        return Attrib(selector, namespace, attrib, 'exists', None)
    op = stream.next()
    if op not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got '%s'" % op)
    value = stream.next()
    if not isinstance(value, (Symbol, String)):
        raise SelectorSyntaxError(
            "Expected string or symbol, got '%s'" % value)
    return Attrib(selector, namespace, attrib, op, value)

812

def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        # Happens when there's nothing, which the CSS parser thinks of as *
        return (0, 0)
    if isinstance(s, int):
        # Happens when you just get a number
        return (0, s)
    if s == 'odd':
        return (2, 1)
    if s == 'even':
        return (2, 0)
    if s == 'n':
        return (1, 0)
    if 'n' not in s:
        # Just a b
        return (0, int(s))
    step_text, offset_text = s.split('n', 1)
    # A missing step means 1; a bare sign means +1 / -1.
    if not step_text:
        a = 1
    elif step_text in ('-', '+'):
        a = int(step_text + '1')
    else:
        a = int(step_text)
    # A missing offset means 0; a bare sign means +1 / -1.
    if not offset_text:
        b = 0
    elif offset_text in ('-', '+'):
        b = int(offset_text + '1')
    else:
        b = int(offset_text)
    return (a, b)

848 849 850 ############################################################ 851 ## Tokenizing 852 ############################################################ 853 854 _match_whitespace = re.compile(r'\s+', re.UNICODE).match 855 856 _replace_comments = re.compile(r'/\*.*?\*/', re.DOTALL).sub 857 858 _match_count_number = re.compile(r'[+-]?\d*n(?:[+-]\d+)?').match 859

def tokenize(s):
    """Generate Symbol, String and Token objects for selector text ``s``.

    Comments are removed first.  Whitespace is skipped, but a ``' '``
    Token is emitted when it directly precedes ':', '::', '.', '#' or '['.
    """
    two_char_tokens = ('~=', '|=', '^=', '$=', '*=', '::', '!=')
    one_char_tokens = '>+~,.*=[]()|:#'
    pos = 0
    s = _replace_comments('', s)
    while True:
        ws = _match_whitespace(s, pos=pos)
        if ws:
            preceding_whitespace_pos = pos
            pos = ws.end()
        else:
            preceding_whitespace_pos = 0
        if pos >= len(s):
            return
        count = _match_count_number(s, pos=pos)
        if count and count.group() != 'n':
            # An "an+b" expression is passed on as a single symbol.
            yield Symbol(s[pos:count.end()], pos)
            pos = count.end()
            continue
        char = s[pos]
        pair = s[pos:pos+2]
        if pair in two_char_tokens:
            if pair == '::' and preceding_whitespace_pos > 0:
                yield Token(' ', preceding_whitespace_pos)
            yield Token(pair, pos)
            pos += 2
            continue
        if char in one_char_tokens:
            if char in ':.#[' and preceding_whitespace_pos > 0:
                yield Token(' ', preceding_whitespace_pos)
            yield Token(char, pos)
            pos += 1
            continue
        token_start = pos
        if char == '"' or char == "'":
            # Quoted string
            text, pos = tokenize_escaped_string(s, pos)
            yield String(text, token_start)
            continue
        text, pos = tokenize_symbol(s, pos)
        yield Symbol(text, token_start)

# Splits a string literal at backslash escapes, keeping the escapes:
# either "\" + 1-6 hex digits + one optional whitespace (or CR LF), or
# "\" + any single non-hex character.  The fragments are raw strings so
# that "\s" reaches the regex engine without being an invalid Python
# string escape; the compiled pattern matches the same text as before.
split_at_string_escapes = re.compile(
    r'(\\(?:%s))' % '|'.join([
        r'[A-Fa-f0-9]{1,6}(?:\r\n|\s)?',
        r'[^A-Fa-f0-9]',
    ])).split


def unescape_string_literal(literal):
    """Return ``literal`` with its backslash escapes resolved.

    Hex escapes become the character with that code point; any other
    escaped character stands for itself.  Raises SelectorSyntaxError
    for a backslash that does not start a complete escape.
    """
    substrings = []
    for substring in split_at_string_escapes(literal):
        if not substring:
            continue
        elif '\\' in substring:
            if substring[0] == '\\' and len(substring) > 1:
                substring = substring[1:]
                if substring[0] in '0123456789ABCDEFabcdef':
                    # int() correctly ignores the potentially trailing whitespace
                    substring = _unichr(int(substring, 16))
            else:
                raise SelectorSyntaxError(
                    "Invalid escape sequence %r in string %r"
                    % (substring.split('\\')[1], literal))
        substrings.append(substring)
    return ''.join(substrings)

924

def tokenize_escaped_string(s, pos):
    """Read the quoted string starting at ``s[pos]``.

    Returns ``(contents, position after the closing quote)``.  Raises
    SelectorSyntaxError when no closing quote is found.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    start = pos + 1
    search_from = start
    while True:
        end = s.find(quote, search_from)
        if end == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        contents = s[start:end]
        if contents.endswith('\\'):
            # next quote character is escaped
            search_from = end + 1
            continue
        if '\\' in contents:
            contents = unescape_string_literal(contents)
        return contents, end + 1

944 945 _illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE) 946

947 -def tokenize_symbol(s, pos):

948 start = pos 949 match = _illegal_symbol.search(s, pos=pos) 950 if not match: 951 # Goes to end of s 952 return s[start:], len(s) 953 if match.start() == pos: 954 assert 0, ( 955 "Unexpected symbol: %r at %s" % (s[pos], pos)) 956 if not match: 957 result = s[start:] 958 pos = len(s) 959 else: 960 result = s[start:match.start()] 961 pos = match.start() 962 try: 963 result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape') 964 except UnicodeDecodeError: 965 import sys 966 e = sys.exc_info()[1] 967 raise SelectorSyntaxError( 968 "Bad symbol %r: %s" % (result, e)) 969 return result, pos

970

class TokenStream(object):
    """Iterator over tokens with one-token lookahead.

    ``used`` lists every token handed out by ``next()``; ``None`` marks
    the end of the stream for both ``next()`` and ``peek()``.
    """

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked = None
        self._peeking = False
        try:
            self.next_token = self.tokens.next
        except AttributeError:
            # Python 3
            self.next_token = self.tokens.__next__

    def next(self):
        """Return the next token and record it in ``used``; None at the end."""
        if self._peeking:
            # Hand out the token that peek() already fetched.
            self._peeking = False
            self.used.append(self.peeked)
            return self.peeked
        try:
            token = self.next_token()
        except StopIteration:
            return None
        self.used.append(token)
        return token

    def __iter__(self):
        return iter(self.next, None)

    def peek(self):
        """Return the next token without consuming it; None at the end."""
        if self._peeking:
            return self.peeked
        try:
            self.peeked = self.next_token()
        except StopIteration:
            return None
        self._peeking = True
        return self.peeked

1009

Source Code for Module lxml.cssselect