# -*- coding: utf-8 -*-
"""
    cssselect.parser
    ================

    Tokenizer, parser and parsed objects for CSS selectors.

    :copyright: (c) 2007-2012 Ian Bicking and contributors.
                See AUTHORS for more details.
    :license: BSD, see LICENSE for more details.

"""

import sys
import re
import operator
import typing
from typing import Iterable, Iterator, List, Optional, Sequence, Tuple, Union


def ascii_lower(string: str) -> str:
    """Lower-case, but only in the ASCII range."""
    # bytes.lower() only touches ASCII letters, so non-ASCII characters
    # survive the encode/decode round trip unchanged.
    return string.encode("utf8").lower().decode("utf8")


class SelectorError(Exception):
    """Common parent for :class:`SelectorSyntaxError` and
    :class:`ExpressionError`.

    You can just use ``except SelectorError:`` when calling
    :meth:`~GenericTranslator.css_to_xpath` and handle both exceptions types.

    """


class SelectorSyntaxError(SelectorError, SyntaxError):
    """Parsing a selector that does not match the grammar."""


#### Parsed objects

# Any node that can appear in a parsed selector tree.
Tree = Union[
    "Element",
    "Hash",
    "Class",
    "Function",
    "Pseudo",
    "Attrib",
    "Negation",
    "Relation",
    "Matching",
    "SpecificityAdjustment",
    "CombinedSelector",
]
# A pseudo-element is either a plain identifier or a functional one.
PseudoElement = Union["FunctionalPseudoElement", str]


class Selector:
    """
    Represents a parsed selector.

    :meth:`~GenericTranslator.selector_to_xpath` accepts this object,
    but ignores :attr:`pseudo_element`. It is the user’s responsibility
    to account for pseudo-elements and reject selectors with unknown
    or unsupported pseudo-elements.

    """

    def __init__(self, tree: Tree, pseudo_element: Optional[PseudoElement] = None) -> None:
        self.parsed_tree = tree
        # Plain identifiers are normalised to ASCII lower case;
        # FunctionalPseudoElement lower-cases its own name.
        if pseudo_element is not None and not isinstance(
            pseudo_element, FunctionalPseudoElement
        ):
            pseudo_element = ascii_lower(pseudo_element)
        #: A :class:`FunctionalPseudoElement`,
        #: or the identifier for the pseudo-element as a string,
        #: or ``None``.
        #:
        #: +-------------------------+----------------+--------------------------------+
        #: |                         | Selector       | Pseudo-element                 |
        #: +=========================+================+================================+
        #: | CSS3 syntax             | ``a::before``  | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | Older syntax            | ``a:before``   | ``'before'``                   |
        #: +-------------------------+----------------+--------------------------------+
        #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'``                   |
        #: | not in Selectors3       |                |                                |
        #: +-------------------------+----------------+--------------------------------+
        #: | Invalid pseudo-class    | ``li:marker``  | ``None``                       |
        #: +-------------------------+----------------+--------------------------------+
        #: | Functional              | ``a::foo(2)``  | ``FunctionalPseudoElement(…)`` |
        #: +-------------------------+----------------+--------------------------------+
        #:
        #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement
        self.pseudo_element = pseudo_element

    def __repr__(self) -> str:
        if isinstance(self.pseudo_element, FunctionalPseudoElement):
            pseudo_element = repr(self.pseudo_element)
        elif self.pseudo_element:
            pseudo_element = "::%s" % self.pseudo_element
        else:
            pseudo_element = ""
        return "%s[%r%s]" % (self.__class__.__name__, self.parsed_tree, pseudo_element)

    def canonical(self) -> str:
        """Return a CSS representation for this selector (a string)"""
        if isinstance(self.pseudo_element, FunctionalPseudoElement):
            pseudo_element = "::%s" % self.pseudo_element.canonical()
        elif self.pseudo_element:
            pseudo_element = "::%s" % self.pseudo_element
        else:
            pseudo_element = ""
        res = "%s%s" % (self.parsed_tree.canonical(), pseudo_element)
        # Drop the implicit universal selector ("*#id" -> "#id"), but keep a
        # lone "*" as is.
        if len(res) > 1:
            res = res.lstrip("*")
        return res

    def specificity(self) -> Tuple[int, int, int]:
        """Return the specificity_ of this selector as a tuple of 3 integers.

        .. _specificity: http://www.w3.org/TR/selectors/#specificity

        """
        a, b, c = self.parsed_tree.specificity()
        # A pseudo-element counts like a type selector.
        if self.pseudo_element:
            c += 1
        return a, b, c


class Class:
    """
    Represents selector.class_name
    """

    def __init__(self, selector: Tree, class_name: str) -> None:
        self.selector = selector
        self.class_name = class_name

    def __repr__(self) -> str:
        return "%s[%r.%s]" % (self.__class__.__name__, self.selector, self.class_name)

    def canonical(self) -> str:
        return "%s.%s" % (self.selector.canonical(), self.class_name)

    def specificity(self) -> Tuple[int, int, int]:
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class FunctionalPseudoElement:
    """
    Represents selector::name(arguments)

    .. attribute:: name

        The name (identifier) of the pseudo-element, as a string.

    .. attribute:: arguments

        The arguments of the pseudo-element, as a list of tokens.

        **Note:** tokens are not part of the public API,
        and may change between cssselect versions.
        Use at your own risks.

    """

    def __init__(self, name: str, arguments: Sequence["Token"]):
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self) -> str:
        return "%s[::%s(%r)]" % (
            self.__class__.__name__,
            self.name,
            [token.value for token in self.arguments],
        )

    def argument_types(self) -> List[str]:
        """Return the token type of every argument, in order."""
        return [token.type for token in self.arguments]

    def canonical(self) -> str:
        args = "".join(token.css() for token in self.arguments)
        return "%s(%s)" % (self.name, args)


class Function:
    """
    Represents selector:name(expr)
    """

    def __init__(self, selector: Tree, name: str, arguments: Sequence["Token"]) -> None:
        self.selector = selector
        self.name = ascii_lower(name)
        self.arguments = arguments

    def __repr__(self) -> str:
        return "%s[%r:%s(%r)]" % (
            self.__class__.__name__,
            self.selector,
            self.name,
            [token.value for token in self.arguments],
        )

    def argument_types(self) -> List[str]:
        """Return the token type of every argument, in order."""
        return [token.type for token in self.arguments]

    def canonical(self) -> str:
        args = "".join(token.css() for token in self.arguments)
        return "%s:%s(%s)" % (self.selector.canonical(), self.name, args)

    def specificity(self) -> Tuple[int, int, int]:
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class Pseudo:
    """
    Represents selector:ident
    """

    def __init__(self, selector: Tree, ident: str) -> None:
        self.selector = selector
        self.ident = ascii_lower(ident)

    def __repr__(self) -> str:
        return "%s[%r:%s]" % (self.__class__.__name__, self.selector, self.ident)

    def canonical(self) -> str:
        return "%s:%s" % (self.selector.canonical(), self.ident)

    def specificity(self) -> Tuple[int, int, int]:
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class Negation:
    """
    Represents selector:not(subselector)
    """

    def __init__(self, selector: Tree, subselector: Tree) -> None:
        self.selector = selector
        self.subselector = subselector

    def __repr__(self) -> str:
        return "%s[%r:not(%r)]" % (self.__class__.__name__, self.selector, self.subselector)

    def canonical(self) -> str:
        subsel = self.subselector.canonical()
        if len(subsel) > 1:
            subsel = subsel.lstrip("*")
        return "%s:not(%s)" % (self.selector.canonical(), subsel)

    def specificity(self) -> Tuple[int, int, int]:
        a1, b1, c1 = self.selector.specificity()
        a2, b2, c2 = self.subselector.specificity()
        return a1 + a2, b1 + b2, c1 + c2


class Relation:
    """
    Represents selector:has(subselector)
    """

    def __init__(self, selector: Tree, combinator: "Token", subselector: Selector):
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self) -> str:
        return "%s[%r:has(%r)]" % (
            self.__class__.__name__,
            self.selector,
            self.subselector,
        )

    def canonical(self) -> str:
        # The subselector may be given as a sequence or as a single object;
        # indexing an object that is not subscriptable raises TypeError.
        try:
            subsel = self.subselector[0].canonical()  # type: ignore
        except TypeError:
            subsel = self.subselector.canonical()
        if len(subsel) > 1:
            subsel = subsel.lstrip("*")
        return "%s:has(%s)" % (self.selector.canonical(), subsel)

    def specificity(self) -> Tuple[int, int, int]:
        a1, b1, c1 = self.selector.specificity()
        # Same sequence-or-single-object fallback as in canonical().
        try:
            a2, b2, c2 = self.subselector[-1].specificity()  # type: ignore
        except TypeError:
            a2, b2, c2 = self.subselector.specificity()
        return a1 + a2, b1 + b2, c1 + c2


class Matching:
    """
    Represents selector:is(selector_list)
    """

    def __init__(self, selector: Tree, selector_list: Iterable[Tree]):
        self.selector = selector
        self.selector_list = selector_list

    def __repr__(self) -> str:
        return "%s[%r:is(%s)]" % (
            self.__class__.__name__,
            self.selector,
            ", ".join(map(repr, self.selector_list)),
        )

    def canonical(self) -> str:
        selector_arguments = [s.canonical().lstrip("*") for s in self.selector_list]
        return "%s:is(%s)" % (self.selector.canonical(), ", ".join(selector_arguments))

    def specificity(self) -> Tuple[int, int, int]:
        """Return the specificity of the compound selector plus that of the
        most specific argument of ``:is()``.
        """
        # The pseudo-class itself is replaced by its most specific argument,
        # but the selector it is attached to still counts (as for Negation).
        a1, b1, c1 = self.selector.specificity()
        a2, b2, c2 = max(x.specificity() for x in self.selector_list)
        return a1 + a2, b1 + b2, c1 + c2


class SpecificityAdjustment:
    """
    Represents selector:where(selector_list)
    Same as selector:is(selector_list), but the pseudo-class itself
    always contributes a specificity of 0
    """

    def __init__(self, selector: Tree, selector_list: List[Tree]):
        self.selector = selector
        self.selector_list = selector_list

    def __repr__(self) -> str:
        return "%s[%r:where(%s)]" % (
            self.__class__.__name__,
            self.selector,
            ", ".join(map(repr, self.selector_list)),
        )

    def canonical(self) -> str:
        selector_arguments = [s.canonical().lstrip("*") for s in self.selector_list]
        return "%s:where(%s)" % (
            self.selector.canonical(),
            ", ".join(selector_arguments),
        )

    def specificity(self) -> Tuple[int, int, int]:
        """Return the specificity of the compound selector alone; the
        ``:where()`` arguments contribute nothing.
        """
        return self.selector.specificity()


class Attrib:
    """
    Represents selector[namespace|attrib operator value]
    """

    @typing.overload
    def __init__(
        self,
        selector: Tree,
        namespace: Optional[str],
        attrib: str,
        operator: 'typing.Literal["exists"]',
        value: None,
    ) -> None:
        ...

    @typing.overload
    def __init__(
        self, selector: Tree, namespace: Optional[str], attrib: str, operator: str, value: "Token"
    ) -> None:
        ...

    def __init__(
        self,
        selector: Tree,
        namespace: Optional[str],
        attrib: str,
        operator: str,
        value: Optional["Token"],
    ) -> None:
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        # None only for the "exists" operator (see the overloads above).
        self.value = value

    def __repr__(self) -> str:
        if self.namespace:
            attrib = "%s|%s" % (self.namespace, self.attrib)
        else:
            attrib = self.attrib
        if self.operator == "exists":
            return "%s[%r[%s]]" % (self.__class__.__name__, self.selector, attrib)
        else:
            return "%s[%r[%s %s %r]]" % (
                self.__class__.__name__,
                self.selector,
                attrib,
                self.operator,
                typing.cast("Token", self.value).value,
            )

    def canonical(self) -> str:
        if self.namespace:
            attrib = "%s|%s" % (self.namespace, self.attrib)
        else:
            attrib = self.attrib

        if self.operator == "exists":
            op = attrib
        else:
            op = "%s%s%s" % (attrib, self.operator, typing.cast("Token", self.value).css())

        return "%s[%s]" % (self.selector.canonical(), op)

    def specificity(self) -> Tuple[int, int, int]:
        a, b, c = self.selector.specificity()
        b += 1
        return a, b, c


class Element:
    """
    Represents namespace|element

    `None` is for the universal selector '*'

    """

    def __init__(self, namespace: Optional[str] = None, element: Optional[str] = None) -> None:
        self.namespace = namespace
        self.element = element

    def __repr__(self) -> str:
        return "%s[%s]" % (self.__class__.__name__, self.canonical())

    def canonical(self) -> str:
        element = self.element or "*"
        if self.namespace:
            element = "%s|%s" % (self.namespace, element)
        return element

    def specificity(self) -> Tuple[int, int, int]:
        # The universal selector has no specificity.
        if self.element:
            return 0, 0, 1
        else:
            return 0, 0, 0


class Hash:
    """
    Represents selector#id
    """

    def __init__(self, selector: Tree, id: str) -> None:
        self.selector = selector
        self.id = id

    def __repr__(self) -> str:
        return "%s[%r#%s]" % (self.__class__.__name__, self.selector, self.id)

    def canonical(self) -> str:
        return "%s#%s" % (self.selector.canonical(), self.id)

    def specificity(self) -> Tuple[int, int, int]:
        a, b, c = self.selector.specificity()
        a += 1
        return a, b, c
class CombinedSelector:
    """
    Represents two selectors joined by a combinator
    (``" "``, ``">"``, ``"+"`` or ``"~"``).
    """

    def __init__(self, selector: Tree, combinator: str, subselector: Tree) -> None:
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self) -> str:
        if self.combinator == " ":
            comb = "<followed>"
        else:
            comb = self.combinator
        return "%s[%r %s %r]" % (self.__class__.__name__, self.selector, comb, self.subselector)

    def canonical(self) -> str:
        subsel = self.subselector.canonical()
        if len(subsel) > 1:
            subsel = subsel.lstrip("*")
        return "%s %s %s" % (self.selector.canonical(), self.combinator, subsel)

    def specificity(self) -> Tuple[int, int, int]:
        a1, b1, c1 = self.selector.specificity()
        a2, b2, c2 = self.subselector.specificity()
        return a1 + a2, b1 + b2, c1 + c2


#### Parser

# foo
_el_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$")

# foo#bar or #bar
_id_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$")

# foo.bar or .bar
_class_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$")


def parse(css: str) -> List[Selector]:
    """Parse a CSS *group of selectors*.

    If you don't care about pseudo-elements or selector specificity,
    you can skip this and use :meth:`~GenericTranslator.css_to_xpath`.

    :param css:
        A *group of selectors* as a string.
    :raises:
        :class:`SelectorSyntaxError` on invalid selectors.
    :returns:
        A list of parsed :class:`Selector` objects, one for each
        selector in the comma-separated group.

    """
    # Fast path for simple cases
    match = _el_re.match(css)
    if match:
        return [Selector(Element(element=match.group(1)))]
    match = _id_re.match(css)
    if match is not None:
        return [Selector(Hash(Element(element=match.group(1) or None), match.group(2)))]
    match = _class_re.match(css)
    if match is not None:
        return [Selector(Class(Element(element=match.group(1) or None), match.group(2)))]

    stream = TokenStream(tokenize(css))
    stream.source = css
    return list(parse_selector_group(stream))


def parse_selector_group(stream: "TokenStream") -> Iterator[Selector]:
    """Yield one :class:`Selector` per comma-separated selector in *stream*."""
    stream.skip_whitespace()
    while True:
        yield Selector(*parse_selector(stream))
        if stream.peek() == ("DELIM", ","):
            stream.next()
            stream.skip_whitespace()
        else:
            break


def parse_selector(stream: "TokenStream") -> Tuple[Tree, Optional[PseudoElement]]:
    """Parse one complex selector: simple selectors joined by combinators.

    :returns: ``(tree, pseudo_element)``
    :raises: :class:`SelectorSyntaxError` if a pseudo-element is followed
        by anything other than the end of the selector.
    """
    result, pseudo_element = parse_simple_selector(stream)
    while True:
        stream.skip_whitespace()
        peek = stream.peek()
        if peek in (("EOF", None), ("DELIM", ",")):
            break
        if pseudo_element:
            raise SelectorSyntaxError(
                "Got pseudo-element ::%s not at the end of a selector" % pseudo_element
            )
        if peek.is_delim("+", ">", "~"):
            # A combinator
            combinator = typing.cast(str, stream.next().value)
            stream.skip_whitespace()
        else:
            # By exclusion, the last parse_simple_selector() ended
            # at peek == ' '
            combinator = " "
        next_selector, pseudo_element = parse_simple_selector(stream)
        result = CombinedSelector(result, combinator, next_selector)
    return result, pseudo_element


def parse_simple_selector(
    stream: "TokenStream", inside_negation: bool = False
) -> Tuple[Tree, Optional[PseudoElement]]:
    """Parse one compound selector (element name plus any ids, classes,
    attribute tests, pseudo-classes and an optional pseudo-element).

    :param inside_negation: when true, a closing ``)`` ends the selector
        (used while parsing function arguments such as ``:not(...)``).
    :returns: ``(tree, pseudo_element)``
    :raises: :class:`SelectorSyntaxError` when no selector is found or the
        syntax is invalid.
    """
    stream.skip_whitespace()
    # Remembered so that "nothing was consumed" can be detected at the end.
    selector_start = len(stream.used)
    peek = stream.peek()
    if peek.type == "IDENT" or peek == ("DELIM", "*"):
        if peek.type == "IDENT":
            namespace = stream.next().value
        else:
            stream.next()
            namespace = None
        if stream.peek() == ("DELIM", "|"):
            stream.next()
            element = stream.next_ident_or_star()
        else:
            # No "|": what was read is the element name, not a namespace.
            element = namespace
            namespace = None
    else:
        element = namespace = None
    result: Tree = Element(namespace, element)
    pseudo_element: Optional[PseudoElement] = None
    while True:
        peek = stream.peek()
        if (
            peek.type in ("S", "EOF")
            or peek.is_delim(",", "+", ">", "~")
            or (inside_negation and peek == ("DELIM", ")"))
        ):
            break
        if pseudo_element:
            raise SelectorSyntaxError(
                "Got pseudo-element ::%s not at the end of a selector" % pseudo_element
            )
        if peek.type == "HASH":
            result = Hash(result, typing.cast(str, stream.next().value))
        elif peek == ("DELIM", "."):
            stream.next()
            result = Class(result, stream.next_ident())
        elif peek == ("DELIM", "|"):
            stream.next()
            result = Element(None, stream.next_ident())
        elif peek == ("DELIM", "["):
            stream.next()
            result = parse_attrib(result, stream)
        elif peek == ("DELIM", ":"):
            stream.next()
            if stream.peek() == ("DELIM", ":"):
                stream.next()
                pseudo_element = stream.next_ident()
                if stream.peek() == ("DELIM", "("):
                    stream.next()
                    pseudo_element = FunctionalPseudoElement(
                        pseudo_element, parse_arguments(stream)
                    )
                continue
            ident = stream.next_ident()
            if ident.lower() in ("first-line", "first-letter", "before", "after"):
                # Special case: CSS 2.1 pseudo-elements can have a single ':'
                # Any new pseudo-element must have two.
                pseudo_element = str(ident)
                continue
            if stream.peek() != ("DELIM", "("):
                result = Pseudo(result, ident)
                if repr(result) == "Pseudo[Element[*]:scope]":
                    # A bare ":scope" is only accepted as the first thing in
                    # a selector: at the very start of the stream (optionally
                    # after whitespace) or right after a comma.
                    if not (
                        len(stream.used) == 2
                        or (len(stream.used) == 3 and stream.used[0].type == "S")
                        or (len(stream.used) >= 3 and stream.used[-3].is_delim(","))
                        or (
                            len(stream.used) >= 4
                            and stream.used[-3].type == "S"
                            and stream.used[-4].is_delim(",")
                        )
                    ):
                        raise SelectorSyntaxError(
                            'Got immediate child pseudo-element ":scope" '
                            "not at the start of a selector"
                        )
                continue
            # Functional pseudo-class: consume the "(".
            stream.next()
            stream.skip_whitespace()
            if ident.lower() == "not":
                if inside_negation:
                    raise SelectorSyntaxError("Got nested :not()")
                argument, argument_pseudo_element = parse_simple_selector(
                    stream, inside_negation=True
                )
                next = stream.next()
                if argument_pseudo_element:
                    raise SelectorSyntaxError(
                        "Got pseudo-element ::%s inside :not() at %s"
                        % (argument_pseudo_element, next.pos)
                    )
                if next != ("DELIM", ")"):
                    raise SelectorSyntaxError("Expected ')', got %s" % (next,))
                result = Negation(result, argument)
            elif ident.lower() == "has":
                combinator, arguments = parse_relative_selector(stream)
                result = Relation(result, combinator, arguments)
            elif ident.lower() in ("matches", "is"):
                selectors = parse_simple_selector_arguments(stream)
                result = Matching(result, selectors)
            elif ident.lower() == "where":
                selectors = parse_simple_selector_arguments(stream)
                result = SpecificityAdjustment(result, selectors)
            else:
                result = Function(result, ident, parse_arguments(stream))
        else:
            raise SelectorSyntaxError("Expected selector, got %s" % (peek,))
    if len(stream.used) == selector_start:
        raise SelectorSyntaxError("Expected selector, got %s" % (stream.peek(),))
    return result, pseudo_element


def parse_arguments(stream: "TokenStream") -> List["Token"]:
    """Collect the argument tokens of a function up to the closing ``)``.

    :raises: :class:`SelectorSyntaxError` on any token that is not an
        identifier, string, number, ``+`` or ``-``.
    """
    arguments: List["Token"] = []
    while True:
        stream.skip_whitespace()
        token = stream.next()
        if token.type in ("IDENT", "STRING", "NUMBER") or token in [
            ("DELIM", "+"),
            ("DELIM", "-"),
        ]:
            arguments.append(token)
        elif token == ("DELIM", ")"):
            return arguments
        else:
            raise SelectorSyntaxError("Expected an argument, got %s" % (token,))


def parse_relative_selector(stream: "TokenStream") -> Tuple["Token", Selector]:
    """Parse the argument of ``:has()`` up to the closing ``)``.

    :returns: ``(combinator_token, selector)``; the combinator defaults to a
        descendant (``" "``) token when none is given.
    :raises: :class:`SelectorSyntaxError` on an unexpected token.
    """
    stream.skip_whitespace()
    subselector = ""
    token = stream.next()

    # NOTE(review): "-" is accepted here although it is not a CSS combinator;
    # kept for backward compatibility -- confirm whether it is intended.
    if token in [("DELIM", "+"), ("DELIM", "-"), ("DELIM", ">"), ("DELIM", "~")]:
        combinator = token
        stream.skip_whitespace()
        token = stream.next()
    else:
        combinator = Token("DELIM", " ", pos=0)

    while True:
        if token.type in ("IDENT", "STRING", "NUMBER") or token in [
            ("DELIM", "."),
            ("DELIM", "*"),
        ]:
            # The argument is rebuilt as text and re-parsed once ")" is seen.
            subselector += typing.cast(str, token.value)
        elif token == ("DELIM", ")"):
            result = parse(subselector)
            return combinator, result[0]
        else:
            raise SelectorSyntaxError("Expected an argument, got %s" % (token,))
        token = stream.next()


def parse_simple_selector_arguments(stream: "TokenStream") -> List[Tree]:
    """Parse the comma-separated selector list of ``:is()`` / ``:where()``
    up to the closing ``)``.

    :raises: :class:`SelectorSyntaxError` if an argument carries a
        pseudo-element, or on an unexpected token (including end of input).
    """
    arguments = []
    while True:
        result, pseudo_element = parse_simple_selector(stream, True)
        if pseudo_element:
            raise SelectorSyntaxError(
                "Got pseudo-element ::%s inside function" % (pseudo_element,)
            )
        stream.skip_whitespace()
        token = stream.next()
        if token == ("DELIM", ","):
            # Only skip whitespace: consuming a token unconditionally here
            # would swallow the next argument when the comma has no space
            # after it (":is(a,b)").
            stream.skip_whitespace()
            arguments.append(result)
        elif token == ("DELIM", ")"):
            arguments.append(result)
            break
        else:
            # Includes EOF: the closing parenthesis is missing.
            raise SelectorSyntaxError("Expected an argument, got %s" % (token,))
    return arguments


def parse_attrib(selector: Tree, stream: "TokenStream") -> Attrib:
    """Parse an attribute test; the opening ``[`` has already been consumed.

    :raises: :class:`SelectorSyntaxError` on invalid syntax.
    """
    stream.skip_whitespace()
    attrib = stream.next_ident_or_star()
    if attrib is None and stream.peek() != ("DELIM", "|"):
        raise SelectorSyntaxError("Expected '|', got %s" % (stream.peek(),))
    namespace: Optional[str]
    op: Optional[str]
    if stream.peek() == ("DELIM", "|"):
        stream.next()
        if stream.peek() == ("DELIM", "="):
            # "|=" is the dash-match operator, not a namespace separator.
            namespace = None
            stream.next()
            op = "|="
        else:
            namespace = attrib
            attrib = stream.next_ident()
            op = None
    else:
        namespace = op = None
    if op is None:
        stream.skip_whitespace()
        next = stream.next()
        if next == ("DELIM", "]"):
            return Attrib(selector, namespace, typing.cast(str, attrib), "exists", None)
        elif next == ("DELIM", "="):
            op = "="
        elif next.is_delim("^", "$", "*", "~", "|", "!") and (stream.peek() == ("DELIM", "=")):
            op = typing.cast(str, next.value) + "="
            stream.next()
        else:
            raise SelectorSyntaxError("Operator expected, got %s" % (next,))
    stream.skip_whitespace()
    value = stream.next()
    if value.type not in ("IDENT", "STRING"):
        raise SelectorSyntaxError("Expected string or ident, got %s" % (value,))
    stream.skip_whitespace()
    next = stream.next()
    if next != ("DELIM", "]"):
        raise SelectorSyntaxError("Expected ']', got %s" % (next,))
    return Attrib(selector, namespace, typing.cast(str, attrib), op, value)


def parse_series(tokens: Iterable["Token"]) -> Tuple[int, int]:
    """
    Parses the arguments for :nth-child() and friends.

    :param tokens: An iterable of tokens
    :returns: ``(a, b)``
    :raises ValueError: if a string token is present or the expression
        cannot be converted to integers.

    """
    # Materialise once: the tokens are scanned twice below, which would
    # silently yield nothing the second time for a one-shot iterator.
    tokens = list(tokens)
    if any(token.type == "STRING" for token in tokens):
        raise ValueError("String tokens not allowed in series.")
    s = "".join(typing.cast(str, token.value) for token in tokens).strip()
    if s == "odd":
        return 2, 1
    elif s == "even":
        return 2, 0
    elif s == "n":
        return 1, 0
    if "n" not in s:
        # Just b
        return 0, int(s)
    a, b = s.split("n", 1)
    a_as_int: int
    if not a:
        a_as_int = 1
    elif a == "-" or a == "+":
        a_as_int = int(a + "1")
    else:
        a_as_int = int(a)
    b_as_int: int
    if not b:
        b_as_int = 0
    else:
        b_as_int = int(b)
    return a_as_int, b_as_int


#### Token objects


class Token(Tuple[str, Optional[str]]):
    """A ``(type, value)`` tuple that also records its position ``pos``.

    Because ``pos`` is an attribute and not part of the tuple, tokens
    compare equal to plain ``(type, value)`` tuples.
    """

    @typing.overload
    def __new__(
        cls,
        type_: 'typing.Literal["IDENT", "HASH", "STRING", "S", "DELIM", "NUMBER"]',
        value: str,
        pos: int,
    ) -> "Token":
        ...

    @typing.overload
    def __new__(cls, type_: 'typing.Literal["EOF"]', value: None, pos: int) -> "Token":
        ...

    def __new__(cls, type_: str, value: Optional[str], pos: int) -> "Token":
        obj = tuple.__new__(cls, (type_, value))
        obj.pos = pos
        return obj

    def __repr__(self) -> str:
        return "<%s '%s' at %i>" % (self.type, self.value, self.pos)

    def is_delim(self, *values: str) -> bool:
        """Return whether this is a DELIM token whose value is in *values*."""
        return self.type == "DELIM" and self.value in values

    pos: int

    @property
    def type(self) -> str:
        return self[0]

    @property
    def value(self) -> Optional[str]:
        return self[1]

    def css(self) -> str:
        """Return the token as CSS source text (string tokens are quoted
        with ``repr``)."""
        if self.type == "STRING":
            return repr(self.value)
        else:
            return typing.cast(str, self.value)


class EOFToken(Token):
    """The ``("EOF", None)`` token marking the end of the input."""

    def __new__(cls, pos: int) -> "EOFToken":
        return typing.cast("EOFToken", Token.__new__(cls, "EOF", None, pos))

    def __repr__(self) -> str:
        return "<%s at %i>" % (self.type, self.pos)


#### Tokenizer


class TokenMacros:
    # Regular-expression fragments; they are substituted by name
    # (e.g. "%(nmchar)s") into the tokenizer patterns.
    unicode_escape = r"\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?"
    escape = unicode_escape + r"|\\[^\n\r\f0-9a-f]"
    string_escape = r"\\(?:\n|\r\n|\r|\f)|" + escape
    nonascii = r"[^\0-\177]"
    nmchar = "[_a-z0-9-]|%s|%s" % (escape, nonascii)
    nmstart = "[_a-z]|%s|%s" % (escape, nonascii)


if typing.TYPE_CHECKING:

    class MatchFunc(typing.Protocol):
        # Signature of a compiled pattern's bound ``match`` method.
        def __call__(
            self, string: str, pos: int = ..., endpos: int = ...
        ) -> Optional["re.Match[str]"]:
            ...
def _compile(pattern: str) -> "MatchFunc":
    """Expand the :class:`TokenMacros` placeholders in *pattern* and return
    the ``match`` method of the compiled, case-insensitive regex."""
    return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match


_match_whitespace = _compile(r"[ \t\r\n\f]+")
_match_number = _compile(r"[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)")
_match_hash = _compile("#(?:%(nmchar)s)+")
_match_ident = _compile("-?(?:%(nmstart)s)(?:%(nmchar)s)*")
# String body matchers, keyed by the opening quote character.
_match_string_by_quote = {
    "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"),
    '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'),
}

_sub_simple_escape = re.compile(r"\\(.)").sub
_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.I).sub
_sub_newline_escape = re.compile(r"\\(?:\n|\r\n|\r|\f)").sub

# Same as r'\1', but faster on CPython
_replace_simple = operator.methodcaller("group", 1)


def _replace_unicode(match: "re.Match[str]") -> str:
    """Return the character for a matched hexadecimal escape; code points
    above ``sys.maxunicode`` become U+FFFD."""
    codepoint = int(match.group(1), 16)
    if codepoint > sys.maxunicode:
        codepoint = 0xFFFD
    return chr(codepoint)


def unescape_ident(value: str) -> str:
    """Resolve hexadecimal escapes, then single-character backslash escapes."""
    value = _sub_unicode_escape(_replace_unicode, value)
    value = _sub_simple_escape(_replace_simple, value)
    return value


def tokenize(s: str) -> Iterator[Token]:
    """Yield the tokens of the selector string *s*, ending with an
    :class:`EOFToken`.

    Comments (``/* ... */``) are dropped; an unterminated comment runs to
    the end of the input.

    :raises: :class:`SelectorSyntaxError` on an unclosed or invalid string.
    """
    pos = 0
    len_s = len(s)
    while pos < len_s:
        match = _match_whitespace(s, pos=pos)
        if match:
            # Any run of whitespace collapses into a single "S" token.
            yield Token("S", " ", pos)
            pos = match.end()
            continue

        match = _match_ident(s, pos=pos)
        if match:
            yield Token("IDENT", unescape_ident(match.group()), pos)
            pos = match.end()
            continue

        match = _match_hash(s, pos=pos)
        if match:
            # [1:] drops the leading "#".
            yield Token("HASH", unescape_ident(match.group()[1:]), pos)
            pos = match.end()
            continue

        quote = s[pos]
        if quote in _match_string_by_quote:
            match = _match_string_by_quote[quote](s, pos=pos + 1)
            assert match, "Should have found at least an empty match"
            end_pos = match.end()
            if end_pos == len_s:
                raise SelectorSyntaxError("Unclosed string at %s" % pos)
            if s[end_pos] != quote:
                raise SelectorSyntaxError("Invalid string at %s" % pos)
            # Escaped newlines are removed before the other escapes are
            # resolved.
            value = unescape_ident(_sub_newline_escape("", match.group()))
            yield Token("STRING", value, pos)
            pos = end_pos + 1
            continue

        match = _match_number(s, pos=pos)
        if match:
            yield Token("NUMBER", match.group(), pos)
            pos = match.end()
            continue

        pos2 = pos + 2
        if s[pos:pos2] == "/*":
            pos = s.find("*/", pos2)
            if pos == -1:
                pos = len_s
            else:
                pos += 2
            continue

        # Anything else is a single-character delimiter.
        yield Token("DELIM", s[pos], pos)
        pos += 1

    assert pos == len_s
    yield EOFToken(pos)


class TokenStream:
    """Token iterator with one token of look-ahead.

    ``used`` lists every token consumed so far, in order.
    """

    def __init__(self, tokens: Iterable[Token], source: Optional[str] = None) -> None:
        self.used: List[Token] = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked: Optional[Token] = None
        # True while ``peeked`` holds a token that has not been consumed yet.
        self._peeking = False
        self.next_token = self.tokens.__next__

    def next(self) -> Token:
        """Consume and return the next token."""
        if self._peeking:
            self._peeking = False
            token = typing.cast(Token, self.peeked)
        else:
            token = self.next_token()
        self.used.append(token)
        return token

    def peek(self) -> Token:
        """Return the next token without consuming it."""
        if not self._peeking:
            self.peeked = self.next_token()
            self._peeking = True
        return typing.cast(Token, self.peeked)

    def next_ident(self) -> str:
        """Consume the next token and return its value.

        :raises: :class:`SelectorSyntaxError` if it is not an identifier.
        """
        token = self.next()
        if token.type != "IDENT":
            raise SelectorSyntaxError("Expected ident, got %s" % (token,))
        return typing.cast(str, token.value)

    def next_ident_or_star(self) -> Optional[str]:
        """Consume the next token; return the identifier value, or ``None``
        for ``*``.

        :raises: :class:`SelectorSyntaxError` for any other token.
        """
        token = self.next()
        if token.type == "IDENT":
            return token.value
        elif token == ("DELIM", "*"):
            return None
        else:
            raise SelectorSyntaxError("Expected ident or '*', got %s" % (token,))

    def skip_whitespace(self) -> None:
        """Consume all consecutive whitespace tokens."""
        # tokenize() emits adjacent "S" tokens when a comment separates two
        # whitespace runs ("a[ /* c */ b]"), so one skip is not enough.
        while self.peek().type == "S":
            self.next()