first commit
This commit is contained in:
5
venv/lib/python3.10/site-packages/pyquery/__init__.py
Normal file
5
venv/lib/python3.10/site-packages/pyquery/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# Copyright (C) 2008 - Olivier Lauzanne <olauzanne@gmail.com>
|
||||
#
|
||||
# Distributed under the BSD license, see LICENSE.txt
|
||||
|
||||
from .pyquery import PyQuery # NOQA
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
469
venv/lib/python3.10/site-packages/pyquery/cssselectpatch.py
Normal file
469
venv/lib/python3.10/site-packages/pyquery/cssselectpatch.py
Normal file
@@ -0,0 +1,469 @@
|
||||
# Copyright (C) 2008 - Olivier Lauzanne <olauzanne@gmail.com>
|
||||
#
|
||||
# Distributed under the BSD license, see LICENSE.txt
|
||||
from __future__ import unicode_literals
|
||||
from cssselect import xpath as cssselect_xpath
|
||||
from cssselect.xpath import ExpressionError
|
||||
|
||||
# Alias for cssselect's own XPathExpr; used as the base class of the local
# XPathExpr subclass and for explicit calls to the inherited implementations.
XPathExprOrig = cssselect_xpath.XPathExpr
|
||||
|
||||
|
||||
class XPathExpr(XPathExprOrig):
    """XPath expression that can carry an extra trailing predicate.

    On top of the inherited ``path`` / ``element`` / ``condition`` parts,
    an optional ``post_condition`` is kept and rendered as a final
    ``[...]`` predicate after the expression produced by the base class.
    """

    def __init__(self, path='', element='*', condition='', star_prefix=False):
        # ``star_prefix`` is accepted so the signature matches the call made
        # by the translator, but it is not used here.
        self.path = path
        self.element = element
        self.condition = condition
        self.post_condition = None

    def add_post_condition(self, post_condition):
        """Attach ``post_condition``, and-ing it with any existing one."""
        existing = self.post_condition
        if not existing:
            self.post_condition = post_condition
            return
        self.post_condition = '%s and (%s)' % (existing, post_condition)

    def __str__(self):
        """Render the base expression plus the post condition, if any."""
        rendered = XPathExprOrig.__str__(self)
        if not self.post_condition:
            return rendered
        return '%s[%s]' % (rendered, self.post_condition)

    def join(self, combiner, other,
             closing_combiner=None, has_inner_condition=False):
        """Join with ``other`` via the base class, then adopt its post
        condition (replacing whatever this expression had)."""
        joined = XPathExprOrig.join(
            self, combiner, other,
            closing_combiner=closing_combiner,
            has_inner_condition=has_inner_condition,
        )
        self.post_condition = other.post_condition
        return joined
|
||||
|
||||
|
||||
# keep cssselect < 0.8 compat for now
|
||||
|
||||
|
||||
class JQueryTranslator(cssselect_xpath.HTMLTranslator):
    """This class is used to implement the css pseudo classes
    (:first, :last, ...) that are not defined in the css standard,
    but are defined in the jquery API.

    Pseudo-classes are implemented as ``xpath_<name>_pseudo`` methods and
    functional pseudo-classes as ``xpath_<name>_function`` methods; each one
    receives the expression built so far, adds a condition to it and
    returns it.
    """

    xpathexpr_cls = XPathExpr

    def _integer_argument(self, function, pseudo_name):
        """Return the single integer argument of ``:<pseudo_name>(n)``.

        Shared by :eq(), :gt() and :lt() so that the validation and the
        error message cannot drift apart between them.

        :raises ExpressionError: if the function was not called with
            exactly one NUMBER argument.
        """
        if function.argument_types() != ['NUMBER']:
            raise ExpressionError(
                "Expected a single integer for :%s(), got %r" % (
                    pseudo_name, function.arguments))
        return int(function.arguments[0].value)

    def xpath_first_pseudo(self, xpath):
        """Matches the first selected element::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><p class="first"></p><p></p></div>')
            >>> d('p:first')
            [<p.first>]

        ..
        """
        xpath.add_post_condition('position() = 1')
        return xpath

    def xpath_last_pseudo(self, xpath):
        """Matches the last selected element::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><p></p><p class="last"></p></div>')
            >>> d('p:last')
            [<p.last>]

        ..
        """
        xpath.add_post_condition('position() = last()')
        return xpath

    def xpath_even_pseudo(self, xpath):
        """Matches even elements, zero-indexed::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><p></p><p class="last"></p></div>')
            >>> d('p:even')
            [<p>]

        ..
        """
        # the first element is 1 in xpath and 0 in python and js
        xpath.add_post_condition('position() mod 2 = 1')
        return xpath

    def xpath_odd_pseudo(self, xpath):
        """Matches odd elements, zero-indexed::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><p></p><p class="last"></p></div>')
            >>> d('p:odd')
            [<p.last>]

        ..
        """
        # xpath positions are 1-based, so zero-indexed "odd" means an even
        # xpath position
        xpath.add_post_condition('position() mod 2 = 0')
        return xpath

    def xpath_checked_pseudo(self, xpath):
        """Matches all input elements that are checked::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input checked="checked"/></div>')
            >>> d('input:checked')
            [<input>]

        ..
        """
        xpath.add_condition("@checked and name(.) = 'input'")
        return xpath

    def xpath_selected_pseudo(self, xpath):
        """Matches all elements that are selected::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<select><option selected="selected"/></select>')
            >>> d('option:selected')
            [<option>]

        ..
        """
        xpath.add_condition("@selected and name(.) = 'option'")
        return xpath

    def _format_disabled_xpath(self, disabled=True):
        """Format XPath condition for :disabled or :enabled pseudo-classes
        according to the WHATWG spec. See:
        https://html.spec.whatwg.org/multipage/semantics-other.html#concept-element-disabled

        :param disabled: when true build the :disabled condition, otherwise
            build the :enabled one by negating each "is disabled" test.
        :returns: the XPath condition as a string.
        """
        # '' keeps each parenthesised test as is; 'not' turns it into not(...)
        bool_op = '' if disabled else 'not'
        return '''(
            ((name(.) = 'button' or name(.) = 'input' or name(.) = 'select'
                    or name(.) = 'textarea' or name(.) = 'fieldset')
                and %s(@disabled or (ancestor::fieldset[@disabled]
                    and not(ancestor::legend[not(preceding-sibling::legend)])))
            )
            or
            ((name(.) = 'option'
                and %s(@disabled or ancestor::optgroup[@disabled]))
            )
            or
            ((name(.) = 'optgroup' and %s(@disabled)))
            )''' % (bool_op, bool_op, bool_op)

    def xpath_disabled_pseudo(self, xpath):
        """Matches all elements that are disabled::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input disabled="disabled"/></div>')
            >>> d('input:disabled')
            [<input>]

        ..
        """
        xpath.add_condition(self._format_disabled_xpath())
        return xpath

    def xpath_enabled_pseudo(self, xpath):
        """Matches all elements that are enabled::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input value="foo" /></div>')
            >>> d('input:enabled')
            [<input>]

        ..
        """
        xpath.add_condition(self._format_disabled_xpath(disabled=False))
        return xpath

    def xpath_file_pseudo(self, xpath):
        """Matches all input elements of type file::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="file"/></div>')
            >>> d('input:file')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'file' and name(.) = 'input'")
        return xpath

    def xpath_input_pseudo(self, xpath):
        """Matches all input elements (input, select, textarea, button)::

            >>> from pyquery import PyQuery
            >>> d = PyQuery(('<div><input type="file"/>'
            ...              '<textarea></textarea></div>'))
            >>> d(':input')
            [<input>, <textarea>]

        ..
        """
        xpath.add_condition((
            "(name(.) = 'input' or name(.) = 'select') "
            "or (name(.) = 'textarea' or name(.) = 'button')"))
        return xpath

    def xpath_button_pseudo(self, xpath):
        """Matches all button input elements and the button element::

            >>> from pyquery import PyQuery
            >>> d = PyQuery(('<div><input type="button"/>'
            ...              '<button></button></div>'))
            >>> d(':button')
            [<input>, <button>]

        ..
        """
        xpath.add_condition((
            "(@type = 'button' and name(.) = 'input') "
            "or name(.) = 'button'"))
        return xpath

    def xpath_radio_pseudo(self, xpath):
        """Matches all radio input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="radio"/></div>')
            >>> d('input:radio')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'radio' and name(.) = 'input'")
        return xpath

    def xpath_text_pseudo(self, xpath):
        """Matches all text input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="text"/></div>')
            >>> d('input:text')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'text' and name(.) = 'input'")
        return xpath

    def xpath_checkbox_pseudo(self, xpath):
        """Matches all checkbox input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="checkbox"/></div>')
            >>> d('input:checkbox')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'checkbox' and name(.) = 'input'")
        return xpath

    def xpath_password_pseudo(self, xpath):
        """Matches all password input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="password"/></div>')
            >>> d('input:password')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'password' and name(.) = 'input'")
        return xpath

    def xpath_submit_pseudo(self, xpath):
        """Matches all submit input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="submit"/></div>')
            >>> d('input:submit')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'submit' and name(.) = 'input'")
        return xpath

    def xpath_hidden_pseudo(self, xpath):
        """Matches all hidden input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="hidden"/></div>')
            >>> d('input:hidden')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'hidden' and name(.) = 'input'")
        return xpath

    def xpath_image_pseudo(self, xpath):
        """Matches all image input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="image"/></div>')
            >>> d('input:image')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'image' and name(.) = 'input'")
        return xpath

    def xpath_reset_pseudo(self, xpath):
        """Matches all reset input elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><input type="reset"/></div>')
            >>> d('input:reset')
            [<input>]

        ..
        """
        xpath.add_condition("@type = 'reset' and name(.) = 'input'")
        return xpath

    def xpath_header_pseudo(self, xpath):
        """Matches all header elements (h1, ..., h6)::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1>title</h1></div>')
            >>> d(':header')
            [<h1>]

        ..
        """
        # this seems kind of brute-force, is there a better way?
        xpath.add_condition((
            "(name(.) = 'h1' or name(.) = 'h2' or name (.) = 'h3') "
            "or (name(.) = 'h4' or name (.) = 'h5' or name(.) = 'h6')"))
        return xpath

    def xpath_parent_pseudo(self, xpath):
        """Match all elements that contain other elements::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1><span>title</span></h1><h1/></div>')
            >>> d('h1:parent')
            [<h1>]

        ..
        """
        xpath.add_condition("count(child::*) > 0")
        return xpath

    def xpath_empty_pseudo(self, xpath):
        """Match all elements that have no child nodes at all::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1><span>title</span></h1><h2/></div>')
            >>> d(':empty')
            [<h2>]

        ..
        """
        xpath.add_condition("not(node())")
        return xpath

    def xpath_eq_function(self, xpath, function):
        """Matches a single element by its index::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1 class="first"/><h1 class="last"/></div>')
            >>> d('h1:eq(0)')
            [<h1.first>]
            >>> d('h1:eq(1)')
            [<h1.last>]

        ..
        """
        value = self._integer_argument(function, 'eq')
        # the index is zero-based, xpath positions are one-based
        xpath.add_post_condition('position() = %s' % (value + 1))
        return xpath

    def xpath_gt_function(self, xpath, function):
        """Matches all elements with an index over the given one::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1 class="first"/><h1 class="last"/></div>')
            >>> d('h1:gt(0)')
            [<h1.last>]

        ..
        """
        value = self._integer_argument(function, 'gt')
        # the index is zero-based, xpath positions are one-based
        xpath.add_post_condition('position() > %s' % (value + 1))
        return xpath

    def xpath_lt_function(self, xpath, function):
        """Matches all elements with an index below the given one::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1 class="first"/><h1 class="last"/></div>')
            >>> d('h1:lt(1)')
            [<h1.first>]

        ..
        """
        # the error raised here names :lt(), not :gt()
        value = self._integer_argument(function, 'lt')
        # the index is zero-based, xpath positions are one-based
        xpath.add_post_condition('position() < %s' % (value + 1))
        return xpath

    def xpath_contains_function(self, xpath, function):
        """Matches all elements that contain the given text::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div><h1/><h1 class="title">title</h1></div>')
            >>> d('h1:contains("title")')
            [<h1.title>]

        ..
        """
        if function.argument_types() not in (['STRING'], ['IDENT']):
            raise ExpressionError(
                "Expected a single string or ident for :contains(), got %r" % (
                    function.arguments,))

        # xpath_literal quotes the text so it is safe inside the expression
        value = self.xpath_literal(function.arguments[0].value)
        xpath.add_post_condition('contains(., %s)' % value)
        return xpath

    def xpath_has_function(self, xpath, function):
        """Matches elements which contain at least one element that matches
        the specified selector (see https://api.jquery.com/has-selector/)::

            >>> from pyquery import PyQuery
            >>> d = PyQuery('<div class="foo"><div class="bar"></div></div>')
            >>> d('.foo:has(".baz")')
            []
            >>> d('.foo:has(".foo")')
            []
            >>> d('.foo:has(".bar")')
            [<div.foo>]
            >>> d('.foo:has(div)')
            [<div.foo>]

        ..
        """
        if function.argument_types() not in (['STRING'], ['IDENT']):
            raise ExpressionError(
                "Expected a single string or ident for :has(), got %r" % (
                    function.arguments,))
        # translate the inner selector relative to the candidate element's
        # descendants and use the result as a predicate
        value = self.css_to_xpath(
            function.arguments[0].value, prefix='descendant::',
        )
        xpath.add_post_condition(value)
        return xpath
|
||||
77
venv/lib/python3.10/site-packages/pyquery/openers.py
Normal file
77
venv/lib/python3.10/site-packages/pyquery/openers.py
Normal file
@@ -0,0 +1,77 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from urllib.request import urlopen
|
||||
from urllib.parse import urlencode
|
||||
from urllib.error import HTTPError
|
||||
|
||||
# ``requests`` is optional: remember whether it could be imported so that
# url_opener can pick the requests-based or the urllib-based opener.
try:
    import requests
    HAS_REQUEST = True
except ImportError:
    HAS_REQUEST = False

# Value passed as ``timeout=`` to requests / urlopen when the caller's kwargs
# carry no 'timeout' entry.
DEFAULT_TIMEOUT = 60

# Types accepted as a "string-like" HTTP method name by _query.
basestring = (str, bytes)

# Names of the caller kwargs that _requests forwards to the requests call;
# any other key is not passed on.
allowed_args = (
    'auth', 'data', 'headers', 'verify',
    'cert', 'config', 'hooks', 'proxies', 'cookies'
)
|
||||
|
||||
|
||||
def _query(url, method, kwargs):
    """Work out the final URL and request body for a call.

    The ``'data'`` entry of ``kwargs`` (if any) is removed from ``kwargs``
    -- the dict is mutated in place -- and then:

    * a dict, list or tuple (or a subclass of one, e.g. ``OrderedDict``) is
      url-encoded into a string;
    * for a ``'get'`` method (case-insensitive) non-empty data is appended
      to ``url`` as a query string and no body is returned;
    * otherwise a non-empty text body is encoded to UTF-8 bytes, while any
      other body (e.g. ``bytes``) is returned untouched.

    :param url: the target URL.
    :param method: HTTP method name, or ``None``.
    :param kwargs: caller options; only ``'data'`` is read (and popped).
    :returns: ``(url, data)`` where ``data`` is the body or ``None``.
    """
    data = kwargs.pop('data', None)
    # isinstance rather than an exact type match, so that subclasses such as
    # OrderedDict are url-encoded instead of failing further down
    if isinstance(data, (dict, list, tuple)):
        data = urlencode(data)

    # only a text method name can ever equal 'get'
    if isinstance(method, str) and method.lower() == 'get' and data:
        if '?' not in url:
            url += '?'
        elif url[-1] not in ('?', '&'):
            url += '&'
        url += data
        data = None

    # bodies that are already bytes have no .encode(); pass them through
    if data and isinstance(data, str):
        data = data.encode('utf-8')
    return url, data
|
||||
|
||||
|
||||
def _requests(url, kwargs):
    """Fetch ``url`` through ``requests`` and return the response text.

    Options read from ``kwargs``: ``method`` (default ``'get'``),
    ``session`` (used instead of the ``requests`` module when truthy),
    ``timeout`` (default ``DEFAULT_TIMEOUT``), ``encoding`` (forced on the
    response when truthy) and every key listed in ``allowed_args``.

    :raises HTTPError: when the response status is outside 200-299.
    """
    verb = kwargs.get('method', 'get').lower()
    forced_encoding = kwargs.get('encoding')
    client = kwargs.get('session') or requests
    send = getattr(client, str(verb))

    if verb == 'get':
        # _query moves any 'data' into the query string and pops it from
        # kwargs, so it is not forwarded below; the returned body is unused
        url, _ = _query(url, verb, kwargs)

    forwarded = {name: kwargs[name] for name in allowed_args if name in kwargs}
    response = send(url=url,
                    timeout=kwargs.get('timeout', DEFAULT_TIMEOUT),
                    **forwarded)

    succeeded = 200 <= response.status_code < 300
    if not succeeded:
        raise HTTPError(response.url, response.status_code,
                        response.reason, response.headers, None)
    if forced_encoding:
        response.encoding = forced_encoding
    return response.text
|
||||
|
||||
|
||||
def _urllib(url, kwargs):
    """Open ``url`` with ``urlopen`` and return the object it yields.

    The URL and body are prepared by ``_query`` from the ``method`` and
    ``data`` entries of ``kwargs``; ``timeout`` falls back to
    ``DEFAULT_TIMEOUT``.
    """
    target, payload = _query(url, kwargs.get('method'), kwargs)
    timeout = kwargs.get('timeout', DEFAULT_TIMEOUT)
    return urlopen(target, payload, timeout=timeout)
|
||||
|
||||
|
||||
def url_opener(url, kwargs):
    """Open ``url`` with the requests-based opener when ``requests`` was
    importable, otherwise with the urllib-based one."""
    opener = _requests if HAS_REQUEST else _urllib
    return opener(url, kwargs)
|
||||
1671
venv/lib/python3.10/site-packages/pyquery/pyquery.py
Normal file
1671
venv/lib/python3.10/site-packages/pyquery/pyquery.py
Normal file
File diff suppressed because it is too large
Load Diff
111
venv/lib/python3.10/site-packages/pyquery/text.py
Normal file
111
venv/lib/python3.10/site-packages/pyquery/text.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import re
|
||||
|
||||
|
||||
# Tags treated as inline: extract_text_array does not surround their content
# with artificial newline markers. List taken from:
# https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#Elements
INLINE_TAGS = {
    'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite',
    'code', 'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map',
    'object', 'q', 'samp', 'script', 'select', 'small', 'span', 'strong',
    'sub', 'sup', 'textarea', 'time', 'tt', 'var'
}

# Tags for which extract_text_array emits an explicit separator marker (True).
SEPARATORS = {'br'}


# Definition of whitespace in HTML:
# https://www.w3.org/TR/html4/struct/text.html#h-9.1
# Matches a run of one or more of: space, tab, form feed, zero-width space
# (U+200B), line feed, carriage return.
WHITESPACE_RE = re.compile(u'[\x20\x09\x0C\u200B\x0A\x0D]+')
|
||||
|
||||
|
||||
def squash_html_whitespace(text):
    """Replace every run of HTML whitespace in ``text`` with one space.

    Not meant for preformatted content (like <pre> content or content made
    preformatted by CSS rules): use the raw extract_text output for that.
    """
    collapsed = WHITESPACE_RE.sub(' ', text)
    return collapsed
|
||||
|
||||
|
||||
def _squash_artifical_nl(parts):
    """Return a new list where consecutive ``None`` markers are collapsed
    into a single ``None``; every other item is kept as is."""
    squashed = []
    previous_was_marker = False
    for item in parts:
        is_marker = item is None
        if is_marker and previous_was_marker:
            # second (or later) marker of a run: drop it
            continue
        squashed.append(item)
        previous_was_marker = is_marker
    return squashed
|
||||
|
||||
|
||||
def _strip_artifical_nl(parts):
    """Trim non-string markers from both ends of ``parts``.

    Returns the slice running from the first string item to the last string
    item (markers in between are kept). An empty ``parts`` is returned
    unchanged. When ``parts`` holds no string at all, the result is a slice
    containing only its last item.
    """
    if not parts:
        return parts
    size = len(parts)
    # index of the first string, falling back to the last index
    first = next(
        (idx for idx, item in enumerate(parts) if isinstance(item, str)),
        size - 1,
    )
    remainder = parts[first:]
    # number of non-string items after the last string
    trailing = next(
        (count for count, item in enumerate(reversed(remainder))
         if isinstance(item, str)),
        len(remainder) - 1,
    )
    return parts[first:size - trailing]
|
||||
|
||||
|
||||
def _merge_original_parts(parts):
    """Merge each run of adjacent strings in ``parts`` into one string.

    A run is concatenated, whitespace-squashed and stripped; it is dropped
    when nothing is left. Non-string items are copied through unchanged and
    end the current run.
    """
    merged = []
    pending = []

    def emit_pending():
        # turn the buffered strings into (at most) one output item
        if not pending:
            return
        text = squash_html_whitespace(''.join(pending)).strip()
        del pending[:]
        if text:
            merged.append(text)

    for part in parts:
        if isinstance(part, str):
            pending.append(part)
            continue
        emit_pending()
        merged.append(part)
    emit_pending()
    return merged
|
||||
|
||||
|
||||
def extract_text_array(dom, squash_artifical_nl=True, strip_artifical_nl=True):
    """Flatten the text of ``dom`` and its descendants into a list.

    The list holds the text fragments (``str``) in document order, mixed
    with two kinds of markers: ``None`` for an artificially inserted newline
    (before and after any element that is neither inline nor a separator)
    and ``True`` for a separator element (see ``SEPARATORS``).

    :param dom: an element exposing ``tag``, ``text``, ``tail`` and
        iteration over its children.
    :param squash_artifical_nl: collapse consecutive ``None`` markers.
    :param strip_artifical_nl: trim markers from both ends of the result.
    :returns: a list; empty for nodes whose ``tag`` is callable.
    """
    if callable(dom.tag):
        # NOTE(review): a callable tag presumably denotes a comment or
        # processing instruction -- confirm. Such nodes contribute no text;
        # return an empty list so the result type is always a list.
        return []
    r = []
    if dom.tag in SEPARATORS:
        r.append(True)  # equivalent of '\n' used to designate separators
    elif dom.tag not in INLINE_TAGS:
        # equivalent of '\n' used to designate artificially inserted newlines
        r.append(None)
    if dom.text is not None:
        r.append(dom.text)
    # iterate the element directly: getchildren() is deprecated
    for child in dom:
        # children are collected raw; squashing/stripping happens once, at
        # the level of the outermost call
        r.extend(extract_text_array(child, squash_artifical_nl=False,
                                    strip_artifical_nl=False))
        if child.tail is not None:
            r.append(child.tail)
    if dom.tag not in INLINE_TAGS and dom.tag not in SEPARATORS:
        # equivalent of '\n' used to designate artificially inserted newlines
        r.append(None)
    if squash_artifical_nl:
        r = _squash_artifical_nl(r)
    if strip_artifical_nl:
        r = _strip_artifical_nl(r)
    return r
|
||||
|
||||
|
||||
def extract_text(dom, block_symbol='\n', sep_symbol='\n', squash_space=True):
    """Return the text content of ``dom`` as a single string.

    :param dom: the element to extract text from.
    :param block_symbol: text substituted for each ``None`` (artificial
        newline) marker.
    :param sep_symbol: text substituted for each ``True`` (separator) marker.
    :param squash_space: when true, merge and whitespace-squash adjacent
        text fragments, collapse and trim the markers, and strip the result.
    """
    pieces = extract_text_array(dom, squash_artifical_nl=squash_space)
    if squash_space:
        pieces = _merge_original_parts(pieces)
        pieces = _squash_artifical_nl(pieces)
        pieces = _strip_artifical_nl(pieces)

    def render(piece):
        # markers become their symbol, text fragments pass through
        if piece is None:
            return block_symbol
        if piece is True:
            return sep_symbol
        return piece

    text = ''.join(render(piece) for piece in pieces)
    return text.strip() if squash_space else text
|
||||
Reference in New Issue
Block a user