first commit

This commit is contained in:
2024-02-23 10:30:02 +00:00
commit ddeb07d0ba
12482 changed files with 1857507 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
# Copyright (C) 2008 - Olivier Lauzanne <olauzanne@gmail.com>
#
# Distributed under the BSD license, see LICENSE.txt
from .pyquery import PyQuery # NOQA

View File

@@ -0,0 +1,469 @@
# Copyright (C) 2008 - Olivier Lauzanne <olauzanne@gmail.com>
#
# Distributed under the BSD license, see LICENSE.txt
from __future__ import unicode_literals
from cssselect import xpath as cssselect_xpath
from cssselect.xpath import ExpressionError
XPathExprOrig = cssselect_xpath.XPathExpr
class XPathExpr(XPathExprOrig):
    """XPath expression that can carry an extra trailing predicate.

    On top of the attributes handled by the parent class, instances keep a
    ``post_condition`` string.  When set, it is rendered as a ``[...]``
    predicate appended after the expression produced by the parent class.
    """

    def __init__(self, path='', element='*', condition='', star_prefix=False):
        # ``star_prefix`` is accepted for signature compatibility only; it is
        # not stored and the parent initializer is deliberately not invoked.
        self.post_condition = None
        self.path, self.element, self.condition = path, element, condition

    def add_post_condition(self, post_condition):
        """AND ``post_condition`` onto the trailing predicate.

        The first condition is stored as given; every later one is wrapped in
        parentheses and joined with ``and``.
        """
        if not self.post_condition:
            self.post_condition = post_condition
            return
        self.post_condition = '%s and (%s)' % (self.post_condition,
                                               post_condition)

    def __str__(self):
        """Render the parent expression, followed by the post condition."""
        rendered = XPathExprOrig.__str__(self)
        if not self.post_condition:
            return rendered
        return '%s[%s]' % (rendered, self.post_condition)

    def join(self, combiner, other,
             closing_combiner=None, has_inner_condition=False):
        """Join with ``other`` via the parent implementation.

        Afterwards this expression takes over ``other``'s post condition,
        replacing whatever post condition it held before the join.
        """
        joined = XPathExprOrig.join(
            self, combiner, other,
            closing_combiner=closing_combiner,
            has_inner_condition=has_inner_condition,
        )
        self.post_condition = other.post_condition
        return joined
# keep cssselect < 0.8 compat for now
class JQueryTranslator(cssselect_xpath.HTMLTranslator):
    """This class is used to implement the css pseudo classes
    (:first, :last, ...) that are not defined in the css standard,
    but are defined in the jquery API.

    Positional selectors (``:first``, ``:eq()``, ...) are added as *post
    conditions* on the expression; the others are added as ordinary
    conditions through ``add_condition``.
    """

    xpathexpr_cls = XPathExpr

    def _index_argument(self, function, pseudo_name):
        """Return the single integer argument of an index pseudo-function.

        Shared by ``:eq()``, ``:gt()`` and ``:lt()`` so that each one reports
        its own name in the error message.

        :param function: the parsed function node being translated.
        :param pseudo_name: name of the pseudo-function, without the colon.
        :raises ExpressionError: if the argument list is not one NUMBER.
        """
        if function.argument_types() != ['NUMBER']:
            raise ExpressionError(
                "Expected a single integer for :%s(), got %r" % (
                    pseudo_name, function.arguments,))
        return int(function.arguments[0].value)

    def xpath_first_pseudo(self, xpath):
        """Matches the first selected element::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><p class="first"></p><p></p></div>')
            >>> d('p:first')
            [<p.first>]

        ..
        """
        xpath.add_post_condition('position() = 1')
        return xpath

    def xpath_last_pseudo(self, xpath):
        """Matches the last selected element::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><p></p><p class="last"></p></div>')
            >>> d('p:last')
            [<p.last>]

        ..
        """
        xpath.add_post_condition('position() = last()')
        return xpath

    def xpath_even_pseudo(self, xpath):
        """Matches even elements, zero-indexed::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><p></p><p class="last"></p></div>')
            >>> d('p:even')
            [<p>]

        ..
        """
        # the first element is 1 in xpath and 0 in python and js
        xpath.add_post_condition('position() mod 2 = 1')
        return xpath

    def xpath_odd_pseudo(self, xpath):
        """Matches odd elements, zero-indexed::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><p></p><p class="last"></p></div>')
            >>> d('p:odd')
            [<p.last>]

        ..
        """
        # zero-indexed odd positions are the even 1-based xpath positions
        xpath.add_post_condition('position() mod 2 = 0')
        return xpath

    def xpath_checked_pseudo(self, xpath):
        """Matches all input elements that are checked::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input checked="checked"/></div>')
            >>> d('input:checked')
            [<input>]

        ..
        """
        xpath.add_condition("@checked and name(.) = 'input'")
        return xpath

    def xpath_selected_pseudo(self, xpath):
        """Matches all elements that are selected::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<select><option selected="selected"/></select>')
            >>> d('option:selected')
            [<option>]

        ..
        """
        xpath.add_condition("@selected and name(.) = 'option'")
        return xpath

    def _format_disabled_xpath(self, disabled=True):
        """Format XPath condition for :disabled or :enabled pseudo-classes
        according to the WHATWG spec. See: https://html.spec.whatwg.org
        /multipage/semantics-other.html#concept-element-disabled

        :param disabled: ``True`` builds the ``:disabled`` condition; ``False``
            prefixes each disabled test with ``not`` to build ``:enabled``.
        :returns: the condition as a string.
        """
        bool_op = '' if disabled else 'not'
        # The template lines are intentionally left unindented so the emitted
        # string stays exactly as it was.
        return '''(
((name(.) = 'button' or name(.) = 'input' or name(.) = 'select'
or name(.) = 'textarea' or name(.) = 'fieldset')
and %s(@disabled or (ancestor::fieldset[@disabled]
and not(ancestor::legend[not(preceding-sibling::legend)])))
)
or
((name(.) = 'option'
and %s(@disabled or ancestor::optgroup[@disabled]))
)
or
((name(.) = 'optgroup' and %s(@disabled)))
)''' % (bool_op, bool_op, bool_op)

    def xpath_disabled_pseudo(self, xpath):
        """Matches all elements that are disabled::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input disabled="disabled"/></div>')
            >>> d('input:disabled')
            [<input>]

        ..
        """
        xpath.add_condition(self._format_disabled_xpath())
        return xpath

    def xpath_enabled_pseudo(self, xpath):
        """Matches all elements that are enabled::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input value="foo" /></div>')
            >>> d('input:enabled')
            [<input>]

        ..
        """
        xpath.add_condition(self._format_disabled_xpath(disabled=False))
        return xpath

    def xpath_file_pseudo(self, xpath):
        """Matches all input elements of type file::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="file"/></div>')
            >>> d('input:file')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'file' and name(.) = 'input'")
        return xpath

    def xpath_input_pseudo(self, xpath):
        """Matches all form control elements (input, select, textarea and
        button)::

            >>> from pyquery import PyQuery
            >>> d = PyQuery(('<div><input type="file"/>'
            ...              '<textarea></textarea></div>'))
            >>> d(':input')
            [<input>, <textarea>]

        ..
        """
        xpath.add_condition((
            "(name(.) = 'input' or name(.) = 'select') "
            "or (name(.) = 'textarea' or name(.) = 'button')"))
        return xpath

    def xpath_button_pseudo(self, xpath):
        """Matches all button input elements and the button element::

            >>> from pyquery import PyQuery
            >>> d = PyQuery(('<div><input type="button"/>'
            ...              '<button></button></div>'))
            >>> d(':button')
            [<input>, <button>]

        ..
        """
        xpath.add_condition((
            "(@type = 'button' and name(.) = 'input') "
            "or name(.) = 'button'"))
        return xpath

    def xpath_radio_pseudo(self, xpath):
        """Matches all radio input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="radio"/></div>')
            >>> d('input:radio')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'radio' and name(.) = 'input'")
        return xpath

    def xpath_text_pseudo(self, xpath):
        """Matches all text input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="text"/></div>')
            >>> d('input:text')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'text' and name(.) = 'input'")
        return xpath

    def xpath_checkbox_pseudo(self, xpath):
        """Matches all checkbox input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="checkbox"/></div>')
            >>> d('input:checkbox')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'checkbox' and name(.) = 'input'")
        return xpath

    def xpath_password_pseudo(self, xpath):
        """Matches all password input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="password"/></div>')
            >>> d('input:password')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'password' and name(.) = 'input'")
        return xpath

    def xpath_submit_pseudo(self, xpath):
        """Matches all submit input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="submit"/></div>')
            >>> d('input:submit')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'submit' and name(.) = 'input'")
        return xpath

    def xpath_hidden_pseudo(self, xpath):
        """Matches all hidden input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="hidden"/></div>')
            >>> d('input:hidden')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'hidden' and name(.) = 'input'")
        return xpath

    def xpath_image_pseudo(self, xpath):
        """Matches all image input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="image"/></div>')
            >>> d('input:image')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'image' and name(.) = 'input'")
        return xpath

    def xpath_reset_pseudo(self, xpath):
        """Matches all reset input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="reset"/></div>')
            >>> d('input:reset')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'reset' and name(.) = 'input'")
        return xpath

    def xpath_header_pseudo(self, xpath):
        """Matches all header elements (h1, ..., h6)::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1>title</h1></div>')
            >>> d(':header')
            [<h1>]

        ..
        """
        # this seems kind of brute-force, is there a better way?
        xpath.add_condition((
            "(name(.) = 'h1' or name(.) = 'h2' or name (.) = 'h3') "
            "or (name(.) = 'h4' or name (.) = 'h5' or name(.) = 'h6')"))
        return xpath

    def xpath_parent_pseudo(self, xpath):
        """Match all elements that contain other elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1><span>title</span></h1><h1/></div>')
            >>> d('h1:parent')
            [<h1>]

        ..
        """
        xpath.add_condition("count(child::*) > 0")
        return xpath

    def xpath_empty_pseudo(self, xpath):
        """Match all elements that have no child nodes at all (neither
        elements nor text)::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1><span>title</span></h1><h2/></div>')
            >>> d(':empty')
            [<h2>]

        ..
        """
        xpath.add_condition("not(node())")
        return xpath

    def xpath_eq_function(self, xpath, function):
        """Matches a single element by its index::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1 class="first"/><h1 class="last"/></div>')
            >>> d('h1:eq(0)')
            [<h1.first>]
            >>> d('h1:eq(1)')
            [<h1.last>]

        ..
        """
        value = self._index_argument(function, 'eq')
        # jquery indexes are zero-based, xpath positions are one-based
        xpath.add_post_condition('position() = %s' % (value + 1))
        return xpath

    def xpath_gt_function(self, xpath, function):
        """Matches all elements with an index over the given one::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1 class="first"/><h1 class="last"/></div>')
            >>> d('h1:gt(0)')
            [<h1.last>]

        ..
        """
        value = self._index_argument(function, 'gt')
        # jquery indexes are zero-based, xpath positions are one-based
        xpath.add_post_condition('position() > %s' % (value + 1))
        return xpath

    def xpath_lt_function(self, xpath, function):
        """Matches all elements with an index below the given one::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1 class="first"/><h1 class="last"/></div>')
            >>> d('h1:lt(1)')
            [<h1.first>]

        ..
        """
        # The error raised for a bad argument now names :lt() (it used to
        # say :gt(), which pointed users at the wrong selector).
        value = self._index_argument(function, 'lt')
        # jquery indexes are zero-based, xpath positions are one-based
        xpath.add_post_condition('position() < %s' % (value + 1))
        return xpath

    def xpath_contains_function(self, xpath, function):
        """Matches all elements that contain the given text

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1/><h1 class="title">title</h1></div>')
            >>> d('h1:contains("title")')
            [<h1.title>]

        ..
        """
        if function.argument_types() not in (['STRING'], ['IDENT']):
            raise ExpressionError(
                "Expected a single string or ident for :contains(), got %r" % (
                    function.arguments,))

        # xpath_literal quotes the text so it can be embedded in the xpath
        value = self.xpath_literal(function.arguments[0].value)
        xpath.add_post_condition('contains(., %s)' % value)
        return xpath

    def xpath_has_function(self, xpath, function):
        """Matches elements which contain at least one element that matches
        the specified selector. https://api.jquery.com/has-selector/

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div class="foo"><div class="bar"></div></div>')
            >>> d('.foo:has(".baz")')
            []
            >>> d('.foo:has(".foo")')
            []
            >>> d('.foo:has(".bar")')
            [<div.foo>]
            >>> d('.foo:has(div)')
            [<div.foo>]

        ..
        """
        if function.argument_types() not in (['STRING'], ['IDENT']):
            raise ExpressionError(
                "Expected a single string or ident for :has(), got %r" % (
                    function.arguments,))
        # translate the inner selector relative to the candidate element, so
        # it is only looked up among that element's descendants
        value = self.css_to_xpath(
            function.arguments[0].value, prefix='descendant::',
        )
        xpath.add_post_condition(value)
        return xpath

View File

@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.error import HTTPError
try:
import requests
HAS_REQUEST = True
except ImportError:
HAS_REQUEST = False
# Timeout value used when the caller's kwargs carry no 'timeout' entry.
DEFAULT_TIMEOUT = 60

# Tuple of string-like types, usable as the second argument of isinstance().
basestring = (str, bytes)

# Names of the keyword arguments that are copied through to the HTTP call
# when present in the caller's kwargs.
allowed_args = (
    'auth',
    'data',
    'headers',
    'verify',
    'cert',
    'config',
    'hooks',
    'proxies',
    'cookies',
)
def _query(url, method, kwargs):
data = None
if 'data' in kwargs:
data = kwargs.pop('data')
if type(data) in (dict, list, tuple):
data = urlencode(data)
if isinstance(method, basestring) and \
method.lower() == 'get' and data:
if '?' not in url:
url += '?'
elif url[-1] not in ('?', '&'):
url += '&'
url += data
data = None
if data:
data = data.encode('utf-8')
return url, data
def _requests(url, kwargs):
    """Fetch ``url`` through the ``requests`` library and return its text.

    Options read from ``kwargs``: ``method`` (default ``'get'``), ``session``
    (used instead of the ``requests`` module when truthy), ``timeout``
    (default ``DEFAULT_TIMEOUT``), ``encoding`` (forced onto the response
    when truthy) and any key listed in ``allowed_args``.

    :raises HTTPError: when the response status is outside 200-299.
    :returns: the response body as text.
    """
    verb = kwargs.get('method', 'get').lower()
    # A caller-supplied session takes precedence over the bare module.
    owner = kwargs.get('session') or requests
    send = getattr(owner, str(verb))

    if verb == 'get':
        # _query pops 'data' from kwargs and folds it into the URL, so it is
        # not forwarded again below; the returned body is not needed here.
        url, _ = _query(url, verb, kwargs)

    passthrough = {
        name: kwargs[name] for name in allowed_args if name in kwargs
    }
    response = send(
        url=url,
        timeout=kwargs.get('timeout', DEFAULT_TIMEOUT),
        **passthrough
    )

    status = response.status_code
    if status < 200 or status >= 300:
        raise HTTPError(response.url, response.status_code,
                        response.reason, response.headers, None)

    charset = kwargs.get('encoding')
    if charset:
        response.encoding = charset
    return response.text
def _urllib(url, kwargs):
    """Open ``url`` with ``urllib`` and return the object from ``urlopen``.

    ``kwargs['method']`` and ``kwargs['data']`` are resolved by ``_query``
    into the final URL and request body; ``kwargs['timeout']`` falls back to
    ``DEFAULT_TIMEOUT``.
    """
    target, body = _query(url, kwargs.get('method'), kwargs)
    wait = kwargs.get('timeout', DEFAULT_TIMEOUT)
    return urlopen(target, body, timeout=wait)
def url_opener(url, kwargs):
    """Fetch ``url`` with ``requests`` when available, else with ``urllib``.

    Note the two backends return different things: ``_requests`` returns the
    response text, ``_urllib`` returns whatever ``urlopen`` returns.
    """
    opener = _requests if HAS_REQUEST else _urllib
    return opener(url, kwargs)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,111 @@
import re
# Tags rendered inline, i.e. not surrounded by artificial newlines when text
# is extracted. List taken from:
# https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#Elements
INLINE_TAGS = {
    'a', 'abbr', 'acronym',
    'b', 'bdo', 'big', 'br', 'button',
    'cite', 'code',
    'dfn',
    'em',
    'i', 'img', 'input',
    'kbd',
    'label',
    'map',
    'object',
    'q',
    'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup',
    'textarea', 'time', 'tt',
    'var',
}

# Tags that stand for an explicit line separator.
SEPARATORS = {'br'}

# One or more consecutive whitespace characters. Definition of whitespace in
# HTML: https://www.w3.org/TR/html4/struct/text.html#h-9.1
WHITESPACE_RE = re.compile(u'[\x20\x09\x0C\u200B\x0A\x0D]+')
def squash_html_whitespace(text):
    """Replace every run of HTML whitespace in ``text`` with a single space.

    Leading and trailing runs are collapsed to one space as well, not
    removed.  Whitespace is what ``WHITESPACE_RE`` matches.
    """
    # NOTE(review): the original comment suggests preformatted content (like
    # <pre>, or set by CSS rules) should use the raw extracted text instead
    # of this function; confirm against callers.
    collapsed = WHITESPACE_RE.sub(' ', text)
    return collapsed
def _squash_artifical_nl(parts):
output, last_nl = [], False
for x in parts:
if x is not None:
output.append(x)
last_nl = False
elif not last_nl:
output.append(None)
last_nl = True
return output
def _strip_artifical_nl(parts):
if not parts:
return parts
for start_idx, pt in enumerate(parts):
if isinstance(pt, str):
# 0, 1, 2, index of first string [start_idx:...
break
iterator = enumerate(parts[:start_idx - 1 if start_idx > 0 else None:-1])
for end_idx, pt in iterator:
if isinstance(pt, str): # 0=None, 1=-1, 2=-2, index of last string
break
return parts[start_idx:-end_idx if end_idx > 0 else None]
def _merge_original_parts(parts):
    """Merge each run of adjacent strings in ``parts`` into one string.

    A run is concatenated, passed through ``squash_html_whitespace`` and
    stripped; it is dropped entirely if nothing is left.  Non-string items
    are kept in place and delimit the runs.

    :returns: a new list.
    """
    merged = []
    pending = []  # strings collected since the last non-string item

    def emit_pending():
        if not pending:
            return
        text = squash_html_whitespace(''.join(pending)).strip()
        del pending[:]
        if text:
            merged.append(text)

    for part in parts:
        if isinstance(part, str):
            pending.append(part)
            continue
        emit_pending()
        merged.append(part)

    emit_pending()
    return merged
def extract_text_array(dom, squash_artifical_nl=True, strip_artifical_nl=True):
    """Flatten ``dom`` and its descendants into a list of text parts.

    The list mixes three kinds of items:

    * ``str``: an element's text or a child's tail text, unmodified;
    * ``True``: a separator, emitted for tags in ``SEPARATORS``;
    * ``None``: an artificial newline, emitted before and after the content
      of every tag that is neither in ``INLINE_TAGS`` nor in ``SEPARATORS``.

    :param dom: element exposing ``tag``, ``text``, ``tail`` and iteration
        over its children.
    :param squash_artifical_nl: collapse consecutive ``None`` items.
    :param strip_artifical_nl: trim non-string items from both ends.
    :returns: the list of parts, or ``''`` when ``dom.tag`` is callable.
    """
    tag = dom.tag
    if callable(tag):
        # NOTE(review): presumably comments / processing instructions, whose
        # tag is a factory function rather than a name; confirm against lxml.
        return ''

    is_separator = tag in SEPARATORS
    is_block = not is_separator and tag not in INLINE_TAGS

    r = []
    if is_separator:
        r.append(True)  # equivalent of '\n' used to designate separators
    elif is_block:
        # equivalent of '\n' used to designate artificially inserted newlines
        r.append(None)

    if dom.text is not None:
        r.append(dom.text)

    # Iterate the element directly: Element.getchildren() is deprecated.
    for child in dom:
        # Post-processing happens once, on the outermost call only.
        r.extend(extract_text_array(child, squash_artifical_nl=False,
                                    strip_artifical_nl=False))
        if child.tail is not None:
            r.append(child.tail)

    if is_block:
        # equivalent of '\n' used to designate artificially inserted newlines
        r.append(None)

    if squash_artifical_nl:
        r = _squash_artifical_nl(r)
    if strip_artifical_nl:
        r = _strip_artifical_nl(r)
    return r
def extract_text(dom, block_symbol='\n', sep_symbol='\n', squash_space=True):
    """Return the text content of ``dom`` as a single string.

    :param dom: element to extract text from.
    :param block_symbol: text substituted for each artificial newline
        (``None`` item).
    :param sep_symbol: text substituted for each separator (``True`` item).
    :param squash_space: when true, merge and whitespace-squash adjacent
        strings, collapse and trim the artificial newlines, and strip the
        final result.
    """
    pieces = extract_text_array(dom, squash_artifical_nl=squash_space)
    if squash_space:
        pieces = _merge_original_parts(pieces)
        pieces = _squash_artifical_nl(pieces)
        pieces = _strip_artifical_nl(pieces)

    rendered = []
    for piece in pieces:
        if piece is None:
            rendered.append(block_symbol)
        elif piece is True:
            rendered.append(sep_symbol)
        else:
            rendered.append(piece)
    text = ''.join(rendered)

    return text.strip() if squash_space else text