890 lines
32 KiB
Python
890 lines
32 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
cssselect.xpath
|
||
===============
|
||
|
||
Translation of parsed CSS selectors to XPath expressions.
|
||
|
||
|
||
:copyright: (c) 2007-2012 Ian Bicking and contributors.
|
||
See AUTHORS for more details.
|
||
:license: BSD, see LICENSE for more details.
|
||
|
||
"""
|
||
|
||
import re
|
||
import typing
|
||
import warnings
|
||
from typing import Optional
|
||
|
||
from cssselect.parser import (
|
||
parse,
|
||
parse_series,
|
||
PseudoElement,
|
||
Selector,
|
||
SelectorError,
|
||
Tree,
|
||
Element,
|
||
Hash,
|
||
Class,
|
||
Function,
|
||
Pseudo,
|
||
Attrib,
|
||
Negation,
|
||
Relation,
|
||
Matching,
|
||
SpecificityAdjustment,
|
||
CombinedSelector,
|
||
)
|
||
|
||
|
||
@typing.no_type_check
def _unicode_safe_getattr(obj, name, default=None):
    """Deprecated stand-in for the built-in :func:`getattr`.

    Emits a ``DeprecationWarning`` attributed to the caller (``stacklevel=2``)
    and then returns ``getattr(obj, name, default)``.
    """
    message = (
        "_unicode_safe_getattr is deprecated and will be removed in the"
        " next release, use getattr() instead"
    )
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return getattr(obj, name, default)
|
||
|
||
|
||
class ExpressionError(SelectorError, RuntimeError):
    """Unknown or unsupported selector (eg. pseudo-class).

    Raised by the translators in this module when a parsed selector has no
    XPath translation.
    """
|
||
|
||
|
||
#### XPath Helpers
|
||
|
||
|
||
class XPathExpr:
    """Mutable builder for a single XPath location path.

    The expression is held as three strings: ``path`` (everything before the
    final step), ``element`` (the node test of the final step) and
    ``condition`` (the predicate on that step).  ``str()`` renders them as
    ``path + element + "[condition]"``.
    """

    def __init__(
        self, path: str = "", element: str = "*", condition: str = "", star_prefix: bool = False
    ) -> None:
        # ``star_prefix`` is accepted but neither stored nor used here.
        self.path, self.element, self.condition = path, element, condition

    def __str__(self) -> str:
        rendered = "".join((str(self.path), str(self.element)))
        if not self.condition:
            return rendered
        return rendered + "[%s]" % self.condition

    def __repr__(self) -> str:
        return "{}[{}]".format(self.__class__.__name__, str(self))

    def add_condition(self, condition: str, conjuction: str = "and") -> "XPathExpr":
        """Combine ``condition`` with the current predicate and return ``self``.

        An empty predicate is simply replaced; otherwise both sides are
        parenthesised and joined with ``conjuction``.
        """
        existing = self.condition
        if not existing:
            self.condition = condition
        else:
            self.condition = "({}) {} ({})".format(existing, conjuction, condition)
        return self

    def add_name_test(self) -> None:
        """Move an explicit element name into the predicate as ``name() = ...``.

        Does nothing when the node test is already ``*``.
        """
        name = self.element
        if name != "*":
            self.add_condition("name() = " + GenericTranslator.xpath_literal(name))
            self.element = "*"

    def add_star_prefix(self) -> None:
        """
        Append '*/' to the path to keep the context constrained
        to a single parent.
        """
        self.path = self.path + "*/"

    def join(
        self,
        combiner: str,
        other: "XPathExpr",
        closing_combiner: Optional[str] = None,
        has_inner_condition: bool = False,
    ) -> "XPathExpr":
        """Extend this expression with ``other`` and return ``self``.

        The current rendering plus ``combiner`` becomes the new path and
        ``other``'s node test becomes the new final step.

        :param closing_combiner:
            Text appended after the new final step (e.g. ``"]"``).
        :param has_inner_condition:
            When true, ``other``'s predicate is written inside the element
            text, before ``closing_combiner``, and ``self.condition`` is left
            as it was.  When false, ``other``'s predicate replaces
            ``self.condition``.
        """
        # Rendered before ``self.path`` is reassigned, so it still describes
        # the expression as it was on entry.
        prefix = str(self) + combiner
        # Any "star prefix" is redundant when joining.
        if other.path != "*/":
            prefix += other.path
        self.path = prefix

        tail = closing_combiner if closing_combiner else ""
        if has_inner_condition:
            predicate = "[" + other.condition + "]" if other.condition else ""
            self.element = other.element + predicate + tail
        else:
            self.element = other.element + tail
            self.condition = other.condition
        return self
|
||
|
||
|
||
# Splits a string on runs of single quotes.  The capturing group makes
# ``split`` keep the quote runs themselves in the result.
split_at_single_quotes = re.compile("('+)").split

# The spec is actually more permissive than that, but don’t bother.
# This is just for the fast path.
# http://www.w3.org/TR/REC-xml/#NT-NameStartChar
# ``fullmatch`` is used instead of ``^...$`` with ``match`` because ``$``
# also matches just before a trailing newline, which would let a name such
# as "a\n" pass as safe.
is_safe_name = re.compile("[a-zA-Z_][a-zA-Z0-9_.-]*").fullmatch

# Test that the string is not empty and does not contain whitespace
# (``fullmatch`` for the same trailing-newline reason as above).
is_non_whitespace = re.compile(r"[^ \t\r\n\f]+").fullmatch
|
||
|
||
|
||
#### Translation
|
||
|
||
|
||
class GenericTranslator:
    """
    Translator for "generic" XML documents.

    Everything is case-sensitive, no assumption is made on the meaning
    of element names and attribute names.

    Translation is dispatched by name: :meth:`xpath` looks up
    ``xpath_<node type>``, combinators look up ``xpath_<name>_combinator``,
    pseudo-classes ``xpath_<name>_pseudo``, functional pseudo-classes
    ``xpath_<name>_function`` and attribute operators ``xpath_attrib_<name>``.

    """

    ####
    ####  HERE BE DRAGONS
    ####
    ####  You are welcome to hook into this to change some behavior,
    ####  but do so at your own risks.
    ####  Until it has received a lot more work and review,
    ####  I reserve the right to change this API in backward-incompatible ways
    ####  with any minor version of cssselect.
    ####  See https://github.com/scrapy/cssselect/pull/22
    ####  -- Simon Sapin.
    ####

    # CSS combinator token -> name used to build the handler method name.
    combinator_mapping = {
        " ": "descendant",
        ">": "child",
        "+": "direct_adjacent",
        "~": "indirect_adjacent",
    }

    # CSS attribute operator -> suffix of the ``xpath_attrib_*`` handler.
    attribute_operator_mapping = {
        "exists": "exists",
        "=": "equals",
        "~=": "includes",
        "|=": "dashmatch",
        "^=": "prefixmatch",
        "$=": "suffixmatch",
        "*=": "substringmatch",
        "!=": "different",  # XXX Not in Level 3 but meh
    }

    #: The attribute used for ID selectors depends on the document language:
    #: http://www.w3.org/TR/selectors/#id-selectors
    id_attribute = "id"

    #: The attribute used for ``:lang()`` depends on the document language:
    #: http://www.w3.org/TR/selectors/#lang-pseudo
    lang_attribute = "xml:lang"

    #: The case sensitivity of document language element names,
    #: attribute names, and attribute values in selectors depends
    #: on the document language.
    #: http://www.w3.org/TR/selectors/#casesens
    #:
    #: When a document language defines one of these as case-insensitive,
    #: cssselect assumes that the document parser makes the parsed values
    #: lower-case. Making the selector lower-case too makes the comparison
    #: case-insensitive.
    #:
    #: In HTML, element names and attributes names (but not attribute values)
    #: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4
    #: and HTMLParser make them lower-case in their parse result, so
    #: the assumption holds.
    lower_case_element_names = False
    lower_case_attribute_names = False
    lower_case_attribute_values = False

    # class used to represent an XPath expression
    xpathexpr_cls = XPathExpr

    def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
        """Translate a *group of selectors* to XPath.

        Pseudo-elements are not supported here since XPath only knows
        about "real" elements.

        :param css:
            A *group of selectors* as a string.
        :param prefix:
            This string is prepended to the XPath expression for each selector.
            The default makes selectors scoped to the context node’s subtree.
        :raises:
            :class:`~cssselect.SelectorSyntaxError` on invalid selectors,
            :class:`ExpressionError` on unknown/unsupported selectors,
            including pseudo-elements.
        :returns:
            The equivalent XPath 1.0 expression as a string.

        """
        return " | ".join(
            self.selector_to_xpath(selector, prefix, translate_pseudo_elements=True)
            for selector in parse(css)
        )

    def selector_to_xpath(
        self,
        selector: Selector,
        prefix: str = "descendant-or-self::",
        translate_pseudo_elements: bool = False,
    ) -> str:
        """Translate a parsed selector to XPath.


        :param selector:
            A parsed :class:`Selector` object.
        :param prefix:
            This string is prepended to the resulting XPath expression.
            The default makes selectors scoped to the context node’s subtree.
        :param translate_pseudo_elements:
            Unless this is set to ``True`` (as :meth:`css_to_xpath` does),
            the :attr:`~Selector.pseudo_element` attribute of the selector
            is ignored.
            It is the caller's responsibility to reject selectors
            with pseudo-elements, or to account for them somehow.
        :raises:
            :class:`ExpressionError` on unknown/unsupported selectors.
            ``TypeError`` when ``selector`` has no (truthy) ``parsed_tree``.
        :returns:
            The equivalent XPath 1.0 expression as a string.

        """
        tree = getattr(selector, "parsed_tree", None)
        if not tree:
            raise TypeError("Expected a parsed selector, got %r" % (selector,))
        xpath = self.xpath(tree)
        assert isinstance(xpath, self.xpathexpr_cls)  # help debug a missing 'return'
        if translate_pseudo_elements and selector.pseudo_element:
            xpath = self.xpath_pseudo_element(xpath, selector.pseudo_element)
        return (prefix or "") + str(xpath)

    def xpath_pseudo_element(self, xpath: XPathExpr, pseudo_element: PseudoElement) -> XPathExpr:
        """Translate a pseudo-element.

        Defaults to not supporting pseudo-elements at all,
        but can be overridden by sub-classes.

        """
        raise ExpressionError("Pseudo-elements are not supported.")

    @staticmethod
    def xpath_literal(s: str) -> str:
        """Return ``s`` as an XPath 1.0 string literal.

        XPath 1.0 literals have no escape mechanism, so a string containing
        both quote characters is rendered as a ``concat(...)`` of pieces,
        each quoted with whichever quote character it does not contain.
        """
        s = str(s)
        if "'" not in s:
            return "'%s'" % s
        if '"' not in s:
            return '"%s"' % s
        # Pieces alternate between runs of single quotes (double-quoted) and
        # text without single quotes (single-quoted); empty pieces are dropped.
        pieces = [
            ('"%s"' if "'" in part else "'%s'") % part
            for part in split_at_single_quotes(s)
            if part
        ]
        return "concat(%s)" % ",".join(pieces)

    def xpath(self, parsed_selector: Tree) -> XPathExpr:
        """Translate any parsed selector object.

        Dispatches to ``xpath_<lower-cased class name>``.

        :raises: :class:`ExpressionError` when no such method exists.
        """
        type_name = type(parsed_selector).__name__
        method = getattr(self, "xpath_%s" % type_name.lower(), None)
        if method is None:
            raise ExpressionError("%s is not supported." % type_name)
        return typing.cast(XPathExpr, method(parsed_selector))

    # Dispatched by parsed object type

    def xpath_combinedselector(self, combined: CombinedSelector) -> XPathExpr:
        """Translate a combined selector."""
        combinator = self.combinator_mapping[combined.combinator]
        method = getattr(self, "xpath_%s_combinator" % combinator)
        return typing.cast(
            XPathExpr, method(self.xpath(combined.selector), self.xpath(combined.subselector))
        )

    def xpath_negation(self, negation: Negation) -> XPathExpr:
        """Translate a negation: the sub-selector's predicate wrapped in ``not()``."""
        xpath = self.xpath(negation.selector)
        sub_xpath = self.xpath(negation.subselector)
        # Fold the element name into the predicate so it is negated as well.
        sub_xpath.add_name_test()
        if sub_xpath.condition:
            return xpath.add_condition("not(%s)" % sub_xpath.condition)
        # The sub-selector matches every element, so its negation matches none.
        return xpath.add_condition("0")

    def xpath_relation(self, relation: Relation) -> XPathExpr:
        """Translate a ``Relation`` node.

        Dispatches on the relation's combinator to one of the
        ``xpath_relation_*_combinator`` methods, which select the left-hand
        element based on the existence of a related right-hand element.
        """
        xpath = self.xpath(relation.selector)
        combinator = relation.combinator
        subselector = relation.subselector
        right = self.xpath(subselector.parsed_tree)
        method = getattr(
            self,
            "xpath_relation_%s_combinator"
            % self.combinator_mapping[typing.cast(str, combinator.value)],
        )
        return typing.cast(XPathExpr, method(xpath, right))

    def _xpath_any_of(self, xpath: XPathExpr, selector_list: typing.Iterable[Tree]) -> XPathExpr:
        """Restrict ``xpath`` to elements matching at least one listed selector.

        The alternatives are OR-ed together first and only then AND-ed onto
        whatever predicate ``xpath`` already carries, so a pre-existing
        condition is never weakened by the list.
        """
        # Translate every alternative up front so unsupported selectors still
        # raise, even if an earlier alternative turns out to be unconditional.
        exprs = [self.xpath(selector) for selector in selector_list]
        alternatives = ""
        for expr in exprs:
            expr.add_name_test()
            if not expr.condition:
                # "*" without a predicate matches every element, so the whole
                # list puts no restriction on ``xpath``.
                return xpath
            if alternatives:
                alternatives = "(%s) or (%s)" % (alternatives, expr.condition)
            else:
                alternatives = expr.condition
        if alternatives:
            xpath.add_condition(alternatives)
        return xpath

    def xpath_matching(self, matching: Matching) -> XPathExpr:
        """Translate a ``Matching`` node: match any selector of its list."""
        return self._xpath_any_of(self.xpath(matching.selector), matching.selector_list)

    def xpath_specificityadjustment(self, matching: SpecificityAdjustment) -> XPathExpr:
        """Translate a ``SpecificityAdjustment`` node; same XPath as :meth:`xpath_matching`."""
        return self._xpath_any_of(self.xpath(matching.selector), matching.selector_list)

    def xpath_function(self, function: Function) -> XPathExpr:
        """Translate a functional pseudo-class."""
        method_name = "xpath_%s_function" % function.name.replace("-", "_")
        method = getattr(self, method_name, None)
        if not method:
            raise ExpressionError("The pseudo-class :%s() is unknown" % function.name)
        return typing.cast(XPathExpr, method(self.xpath(function.selector), function))

    def xpath_pseudo(self, pseudo: Pseudo) -> XPathExpr:
        """Translate a pseudo-class."""
        method_name = "xpath_%s_pseudo" % pseudo.ident.replace("-", "_")
        method = getattr(self, method_name, None)
        if not method:
            # TODO: better error message for pseudo-elements?
            raise ExpressionError("The pseudo-class :%s is unknown" % pseudo.ident)
        return typing.cast(XPathExpr, method(self.xpath(pseudo.selector)))

    def xpath_attrib(self, selector: Attrib) -> XPathExpr:
        """Translate an attribute selector."""
        operator = self.attribute_operator_mapping[selector.operator]
        method = getattr(self, "xpath_attrib_%s" % operator)
        if self.lower_case_attribute_names:
            name = selector.attrib.lower()
        else:
            name = selector.attrib
        safe = is_safe_name(name)
        if selector.namespace:
            name = "%s:%s" % (selector.namespace, name)
            safe = safe and is_safe_name(selector.namespace)
        if safe:
            attrib = "@" + name
        else:
            # Names that are not plainly safe are compared as string literals
            # instead of being written into the expression.
            attrib = "attribute::*[name() = %s]" % self.xpath_literal(name)
        if selector.value is None:
            value = None
        elif self.lower_case_attribute_values:
            value = typing.cast(str, selector.value.value).lower()
        else:
            value = selector.value.value
        return typing.cast(XPathExpr, method(self.xpath(selector.selector), attrib, value))

    def xpath_class(self, class_selector: Class) -> XPathExpr:
        """Translate a class selector."""
        # .foo is defined as [class~=foo] in the spec.
        xpath = self.xpath(class_selector.selector)
        return self.xpath_attrib_includes(xpath, "@class", class_selector.class_name)

    def xpath_hash(self, id_selector: Hash) -> XPathExpr:
        """Translate an ID selector, comparing against :attr:`id_attribute`."""
        xpath = self.xpath(id_selector.selector)
        return self.xpath_attrib_equals(xpath, "@" + self.id_attribute, id_selector.id)

    def xpath_element(self, selector: Element) -> XPathExpr:
        """Translate a type or universal selector."""
        element = selector.element
        if not element:
            element = "*"
            safe = True
        else:
            safe = bool(is_safe_name(element))
            if self.lower_case_element_names:
                element = element.lower()
        if selector.namespace:
            # Namespace prefixes are case-sensitive.
            # http://www.w3.org/TR/css3-namespace/#prefixes
            element = "%s:%s" % (selector.namespace, element)
            safe = safe and bool(is_safe_name(selector.namespace))
        xpath = self.xpathexpr_cls(element=element)
        if not safe:
            xpath.add_name_test()
        return xpath

    # CombinedSelector: dispatch by combinator

    def xpath_descendant_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is a child, grand-child or further descendant of left"""
        return left.join("/descendant-or-self::*/", right)

    def xpath_child_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is an immediate child of left"""
        return left.join("/", right)

    def xpath_direct_adjacent_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is a sibling immediately after left"""
        xpath = left.join("/following-sibling::", right)
        # The name goes into the predicate so that ``position() = 1`` counts
        # among all following siblings, not only those with that name.
        xpath.add_name_test()
        return xpath.add_condition("position() = 1")

    def xpath_indirect_adjacent_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is a sibling after left, immediately or not"""
        return left.join("/following-sibling::", right)

    def _xpath_relation_join(self, left: XPathExpr, opening: str, right: XPathExpr) -> XPathExpr:
        """Append ``[<opening><right>]`` to ``left``, keeping right's predicate on right."""
        joined = left.join(opening, right, closing_combiner="]", has_inner_condition=True)
        # ``join`` has already rendered left's predicate into the path; leaving
        # it in ``condition`` too would only repeat it after the new predicate.
        joined.condition = ""
        return joined

    def xpath_relation_descendant_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is a child, grand-child or further descendant of left; select left"""
        return left.join("[descendant::", right, closing_combiner="]", has_inner_condition=True)

    def xpath_relation_child_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr:
        """right is an immediate child of left; select left"""
        return self._xpath_relation_join(left, "[./", right)

    def xpath_relation_direct_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling immediately after left; select left"""
        if right.path:
            # ``right`` has several steps: the first following sibling must
            # match its first step, the remaining steps continue from there.
            return self._xpath_relation_join(
                left, "[following-sibling::*[position() = 1]/self::", right
            )
        # Single step: test name, own predicate and position in one predicate
        # on ``following-sibling::*`` so that position() counts all siblings.
        right.add_name_test()
        right.add_condition("position() = 1")
        return left.add_condition("following-sibling::*[%s]" % right.condition)

    def xpath_relation_indirect_adjacent_combinator(
        self, left: XPathExpr, right: XPathExpr
    ) -> XPathExpr:
        """right is a sibling after left, immediately or not; select left"""
        return self._xpath_relation_join(left, "[following-sibling::", right)

    # Function: dispatch by function/pseudo-class name

    def xpath_nth_child_function(
        self, xpath: XPathExpr, function: Function, last: bool = False, add_name_test: bool = True
    ) -> XPathExpr:
        """Translate ``:nth-child(an+b)`` and, via the flags, its three variants.

        :param last:
            Count following siblings instead of preceding ones
            (the ``nth-last-*`` forms).
        :param add_name_test:
            When true, count siblings of any name (``*``); when false, count
            only siblings with this expression's element name
            (the ``*-of-type`` forms).
        :raises: :class:`ExpressionError` when the arguments are not a valid series.
        """
        try:
            a, b = parse_series(function.arguments)
        except ValueError as exc:
            raise ExpressionError("Invalid series: '%r'" % function.arguments) from exc

        # From https://www.w3.org/TR/css3-selectors/#structural-pseudos:
        #
        # :nth-child(an+b)
        #       an+b-1 siblings before
        #
        # :nth-last-child(an+b)
        #       an+b-1 siblings after
        #
        # :nth-of-type(an+b)
        #       an+b-1 siblings with the same expanded element name before
        #
        # :nth-last-of-type(an+b)
        #       an+b-1 siblings with the same expanded element name after
        #
        # So,
        # for :nth-child and :nth-of-type
        #
        #    count(preceding-sibling::<nodetest>) = an+b-1
        #
        # for :nth-last-child and :nth-last-of-type
        #
        #    count(following-sibling::<nodetest>) = an+b-1
        #
        # therefore,
        #    count(...) - (b-1) ≡ 0 (mod a)
        #
        # if a == 0:
        # ~~~~~~~~~~
        #    count(...) = b-1
        #
        # if a < 0:
        # ~~~~~~~~~
        #    count(...) - b +1 <= 0
        # -> count(...) <= b-1
        #
        # if a > 0:
        # ~~~~~~~~~
        #    count(...) - b +1 >= 0
        # -> count(...) >= b-1

        # work with b-1 instead
        b_min_1 = b - 1

        # early-exit condition 1:
        # ~~~~~~~~~~~~~~~~~~~~~~~
        # for a == 1, nth-*(an+b) means n+b-1 siblings before/after,
        # and since n ∈ {0, 1, 2, ...}, if b-1<=0,
        # there is always an "n" matching any number of siblings (maybe none)
        if a == 1 and b_min_1 <= 0:
            return xpath

        # early-exit condition 2:
        # ~~~~~~~~~~~~~~~~~~~~~~~
        # an+b-1 siblings with a<0 and (b-1)<0 is not possible
        if a < 0 and b_min_1 < 0:
            return xpath.add_condition("0")

        # `add_name_test` boolean is inverted and somewhat counter-intuitive:
        #
        # nth_of_type() calls nth_child(add_name_test=False)
        if add_name_test:
            nodetest = "*"
        else:
            nodetest = "%s" % xpath.element

        # count siblings before or after the element
        if not last:
            siblings_count = "count(preceding-sibling::%s)" % nodetest
        else:
            siblings_count = "count(following-sibling::%s)" % nodetest

        # special case of fixed position: nth-*(0n+b)
        # if a == 0:
        # ~~~~~~~~~~
        #    count(***-sibling::***) = b-1
        if a == 0:
            return xpath.add_condition("%s = %s" % (siblings_count, b_min_1))

        expressions = []

        if a > 0:
            # siblings count, an+b-1, is always >= 0,
            # so if a>0, and (b-1)<=0, an "n" exists to satisfy this,
            # therefore, the predicate is only interesting if (b-1)>0
            if b_min_1 > 0:
                expressions.append("%s >= %s" % (siblings_count, b_min_1))
        else:
            # if a<0, and (b-1)<0, no "n" satisfies this,
            # this is tested above as an early exit condition
            # otherwise,
            expressions.append("%s <= %s" % (siblings_count, b_min_1))

        # operations modulo 1 or -1 are simpler, one only needs to verify:
        #
        # - either:
        #   count(***-sibling::***) - (b-1) = n = 0, 1, 2, 3, etc.,
        #   i.e. count(***-sibling::***) >= (b-1)
        #
        # - or:
        #   count(***-sibling::***) - (b-1) = -n = 0, -1, -2, -3, etc.,
        #   i.e. count(***-sibling::***) <= (b-1)
        # as we just did above.
        #
        if abs(a) != 1:
            # count(***-sibling::***) - (b-1) ≡ 0 (mod a)
            left = siblings_count

            # apply "modulo a" on 2nd term, -(b-1),
            # to simplify things like "(... +6) % -3",
            # and also make it positive with |a|
            b_neg = (-b_min_1) % abs(a)

            if b_neg != 0:
                b_neg_as_str = "+%s" % b_neg
                left = "(%s %s)" % (left, b_neg_as_str)

            expressions.append("%s mod %s = 0" % (left, a))

        # Parenthesise the parts only when there is more than one to join.
        if len(expressions) > 1:
            template = "(%s)"
        else:
            template = "%s"
        xpath.add_condition(" and ".join(template % expression for expression in expressions))
        return xpath

    def xpath_nth_last_child_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:nth-last-child()``."""
        return self.xpath_nth_child_function(xpath, function, last=True)

    def xpath_nth_of_type_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:nth-of-type()``; requires an explicit element name."""
        if xpath.element == "*":
            raise ExpressionError("*:nth-of-type() is not implemented")
        return self.xpath_nth_child_function(xpath, function, add_name_test=False)

    def xpath_nth_last_of_type_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:nth-last-of-type()``; requires an explicit element name."""
        if xpath.element == "*":
            raise ExpressionError("*:nth-last-of-type() is not implemented")
        return self.xpath_nth_child_function(xpath, function, last=True, add_name_test=False)

    def xpath_contains_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:contains()`` to a ``contains(., ...)`` test on the element's text."""
        # Defined there, removed in later drafts:
        # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                "Expected a single string or ident for :contains(), got %r" % function.arguments
            )
        value = typing.cast(str, function.arguments[0].value)
        return xpath.add_condition("contains(., %s)" % self.xpath_literal(value))

    def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:lang()`` using the XPath ``lang()`` function."""
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                "Expected a single string or ident for :lang(), got %r" % function.arguments
            )
        value = typing.cast(str, function.arguments[0].value)
        return xpath.add_condition("lang(%s)" % (self.xpath_literal(value)))

    # Pseudo: dispatch by pseudo-class name

    def xpath_root_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:root`` -- the element has no parent element."""
        return xpath.add_condition("not(parent::*)")

    # CSS immediate children (CSS ":scope > div" to XPath "child::div" or "./div")
    # Works only at the start of a selector
    # Needed to get immediate children of a processed selector in Scrapy
    # for product in response.css('.product'):
    #     description = product.css(':scope > div::text').get()
    def xpath_scope_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:scope`` -- adds the always-true predicate ``1``."""
        return xpath.add_condition("1")

    def xpath_first_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:first-child`` -- no preceding element sibling."""
        return xpath.add_condition("count(preceding-sibling::*) = 0")

    def xpath_last_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:last-child`` -- no following element sibling."""
        return xpath.add_condition("count(following-sibling::*) = 0")

    def xpath_first_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:first-of-type`` -- no preceding sibling with the same element name."""
        if xpath.element == "*":
            raise ExpressionError("*:first-of-type is not implemented")
        return xpath.add_condition("count(preceding-sibling::%s) = 0" % xpath.element)

    def xpath_last_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:last-of-type`` -- no following sibling with the same element name."""
        if xpath.element == "*":
            raise ExpressionError("*:last-of-type is not implemented")
        return xpath.add_condition("count(following-sibling::%s) = 0" % xpath.element)

    def xpath_only_child_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:only-child`` -- the parent has exactly one element child."""
        return xpath.add_condition("count(parent::*/child::*) = 1")

    def xpath_only_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:only-of-type`` -- the parent has exactly one child with this element name."""
        if xpath.element == "*":
            raise ExpressionError("*:only-of-type is not implemented")
        return xpath.add_condition("count(parent::*/child::%s) = 1" % xpath.element)

    def xpath_empty_pseudo(self, xpath: XPathExpr) -> XPathExpr:
        """``:empty`` -- no element children and an empty string value."""
        return xpath.add_condition("not(*) and not(string-length())")

    def pseudo_never_matches(self, xpath: XPathExpr) -> XPathExpr:
        """Common implementation for pseudo-classes that never match."""
        return xpath.add_condition("0")

    xpath_link_pseudo = pseudo_never_matches
    xpath_visited_pseudo = pseudo_never_matches
    xpath_hover_pseudo = pseudo_never_matches
    xpath_active_pseudo = pseudo_never_matches
    xpath_focus_pseudo = pseudo_never_matches
    xpath_target_pseudo = pseudo_never_matches
    xpath_enabled_pseudo = pseudo_never_matches
    xpath_disabled_pseudo = pseudo_never_matches
    xpath_checked_pseudo = pseudo_never_matches

    # Attrib: dispatch by attribute operator
    #
    # Each handler receives the expression to extend, the XPath text that
    # selects the attribute (``name``) and the selector's value, if any.

    def xpath_attrib_exists(self, xpath: XPathExpr, name: str, value: Optional[str]) -> XPathExpr:
        """``[attr]`` -- the attribute is present."""
        assert not value
        xpath.add_condition(name)
        return xpath

    def xpath_attrib_equals(self, xpath: XPathExpr, name: str, value: Optional[str]) -> XPathExpr:
        """``[attr=value]`` -- the attribute equals ``value``."""
        assert value is not None
        xpath.add_condition("%s = %s" % (name, self.xpath_literal(value)))
        return xpath

    def xpath_attrib_different(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr!=value]`` -- for a non-empty value, also matches a missing attribute."""
        assert value is not None
        # FIXME: this seems like a weird hack...
        if value:
            xpath.add_condition("not(%s) or %s != %s" % (name, name, self.xpath_literal(value)))
        else:
            xpath.add_condition("%s != %s" % (name, self.xpath_literal(value)))
        return xpath

    def xpath_attrib_includes(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr~=value]`` -- ``value`` is one of the whitespace-separated words.

        An empty value, or one containing whitespace, never matches.
        """
        if value and is_non_whitespace(value):
            # Pad both the normalized attribute and the word with spaces so
            # that only whole words match.
            xpath.add_condition(
                "%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
                % (name, name, self.xpath_literal(" " + value + " "))
            )
        else:
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_dashmatch(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr|=value]`` -- equals ``value`` or starts with ``value`` plus ``-``."""
        assert value is not None
        # Weird, but true...
        xpath.add_condition(
            "%s and (%s = %s or starts-with(%s, %s))"
            % (name, name, self.xpath_literal(value), name, self.xpath_literal(value + "-"))
        )
        return xpath

    def xpath_attrib_prefixmatch(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr^=value]`` -- starts with ``value``; an empty value never matches."""
        if value:
            xpath.add_condition(
                "%s and starts-with(%s, %s)" % (name, name, self.xpath_literal(value))
            )
        else:
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_suffixmatch(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr$=value]`` -- ends with ``value``; an empty value never matches."""
        if value:
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            # substring() is 1-based, so starting at
            # string-length - (len(value) - 1) yields the last len(value) chars.
            xpath.add_condition(
                "%s and substring(%s, string-length(%s)-%s) = %s"
                % (name, name, name, len(value) - 1, self.xpath_literal(value))
            )
        else:
            xpath.add_condition("0")
        return xpath

    def xpath_attrib_substringmatch(
        self, xpath: XPathExpr, name: str, value: Optional[str]
    ) -> XPathExpr:
        """``[attr*=value]`` -- contains ``value``; an empty value never matches."""
        if value:
            # Attribute selectors are case sensitive
            xpath.add_condition(
                "%s and contains(%s, %s)" % (name, name, self.xpath_literal(value))
            )
        else:
            xpath.add_condition("0")
        return xpath
|
||
|
||
|
||
class HTMLTranslator(GenericTranslator):
    """
    Translator for (X)HTML documents.

    Has a more useful implementation of some pseudo-classes based on
    HTML-specific element names and attribute names, as described in
    the `HTML5 specification`_. It assumes no-quirks mode.
    The API is the same as :class:`GenericTranslator`.

    .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors

    :param xhtml:
        If false (the default), element names and attribute names
        are case-insensitive.

    """

    lang_attribute = "lang"

    def __init__(self, xhtml: bool = False) -> None:
        self.xhtml = xhtml  # Might be useful for sub-classes?
        if not xhtml:
            # See their definition in GenericTranslator.
            self.lower_case_element_names = True
            self.lower_case_attribute_names = True

    def xpath_checked_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore
        """``:checked`` -- selected ``option``, or checked checkbox/radio ``input``/``command``."""
        # FIXME: is this really all the elements?
        return xpath.add_condition(
            "(@selected and name(.) = 'option') or "
            "(@checked "
            "and (name(.) = 'input' or name(.) = 'command')"
            "and (@type = 'checkbox' or @type = 'radio'))"
        )

    def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr:
        """Translate ``:lang()`` by inspecting :attr:`lang_attribute`.

        The nearest ancestor-or-self element carrying the attribute is used;
        its lower-cased value followed by ``-`` must start with the
        lower-cased argument followed by ``-``.

        :raises:
            :class:`ExpressionError` when the argument is not a single
            string or ident, or is empty.
        """
        if function.argument_types() not in (["STRING"], ["IDENT"]):
            raise ExpressionError(
                "Expected a single string or ident for :lang(), got %r" % function.arguments
            )
        value = function.arguments[0].value
        # Raised explicitly rather than asserted, so the check is not dropped
        # when Python runs with -O.
        if not value:
            raise ExpressionError(
                "Expected a non-empty string or ident for :lang(), got %r" % function.arguments
            )
        return xpath.add_condition(
            # The same attribute is used to locate the element and to read
            # the language from it.
            "ancestor-or-self::*[@%s][1][starts-with(concat("
            # XPath 1.0 has no lower-case function...
            "translate(@%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
            "'abcdefghijklmnopqrstuvwxyz'), "
            "'-'), %s)]"
            % (
                self.lang_attribute,
                self.lang_attribute,
                self.xpath_literal(value.lower() + "-"),
            )
        )

    def xpath_link_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore
        """``:link`` -- ``a``, ``link`` or ``area`` elements with an ``href``."""
        return xpath.add_condition(
            "@href and (name(.) = 'a' or name(.) = 'link' or name(.) = 'area')"
        )

    # Links are never visited, the implementation for :visited is the same
    # as in GenericTranslator

    def xpath_disabled_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore
        """``:disabled`` -- a form-related element that is disabled.

        Either the element itself has ``@disabled``, or it is an ``input``
        (not hidden), ``button``, ``select`` or ``textarea`` inside a
        disabled ``fieldset``.
        """
        # http://www.w3.org/TR/html5/section-index.html#attributes-1
        return xpath.add_condition(
            """
        (
            @disabled and
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea' or
                name(.) = 'command' or
                name(.) = 'fieldset' or
                name(.) = 'optgroup' or
                name(.) = 'option'
            )
        ) or (
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea'
            )
            and ancestor::fieldset[@disabled]
        )
        """
        )
        # FIXME: in the second half, add "and is not a descendant of that
        # fieldset element's first legend element child, if any."

    def xpath_enabled_pseudo(self, xpath: XPathExpr) -> XPathExpr:  # type: ignore
        """``:enabled`` -- links with ``href`` and form-related elements not disabled.

        Form controls must have neither ``@disabled`` nor a disabled
        ``fieldset`` ancestor; ``option`` must have neither ``@disabled`` nor
        a disabled ``optgroup`` ancestor.
        """
        # http://www.w3.org/TR/html5/section-index.html#attributes-1
        return xpath.add_condition(
            """
        (
            @href and (
                name(.) = 'a' or
                name(.) = 'link' or
                name(.) = 'area'
            )
        ) or (
            (
                name(.) = 'command' or
                name(.) = 'fieldset' or
                name(.) = 'optgroup'
            )
            and not(@disabled)
        ) or (
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea' or
                name(.) = 'keygen'
            )
            and not (@disabled or ancestor::fieldset[@disabled])
        ) or (
            name(.) = 'option' and not(
                @disabled or ancestor::optgroup[@disabled]
            )
        )
        """
        )
        # FIXME: ... or "li elements that are children of menu elements,
        # and that have a child element that defines a command, if the first
        # such element's Disabled State facet is false (not disabled)".
        # FIXME: after ancestor::fieldset[@disabled], add "and is not a
        # descendant of that fieldset element's first legend element child,
        # if any."
|