first commit
This commit is contained in:
111
venv/lib/python3.10/site-packages/pyquery/text.py
Normal file
111
venv/lib/python3.10/site-packages/pyquery/text.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import re
|
||||
|
||||
|
||||
# Tags treated as inline when extracting text; a tag that is not listed here
# gets a block boundary marker placed before and after its content.
# https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#Elements
INLINE_TAGS = {
    'a', 'abbr', 'acronym',
    'b', 'bdo', 'big', 'br', 'button',
    'cite', 'code',
    'dfn',
    'em',
    'i', 'img', 'input',
    'kbd',
    'label',
    'map',
    'object',
    'q',
    'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup',
    'textarea', 'time', 'tt',
    'var',
}

# Tags that emit an explicit separator marker instead of a block boundary.
SEPARATORS = {'br'}
|
||||
|
||||
|
||||
# Characters counted as whitespace in HTML (space, tab, form feed,
# zero-width space, line feed, carriage return):
# https://www.w3.org/TR/html4/struct/text.html#h-9.1
WHITESPACE_RE = re.compile(u'[\x20\x09\x0C\u200B\x0A\x0D]+')


def squash_html_whitespace(text):
    """Replace every run of HTML whitespace in *text* with a single space.

    Runs at the start or end of the string are collapsed to one space as
    well; they are not removed. Characters outside the HTML whitespace set
    (for example a non-breaking space) are left untouched.

    This is meant for normally rendered content. For preformatted content
    (such as ``<pre>``, or whitespace preserved through CSS rules) use the
    raw extracted text instead of passing it through this function.
    """
    collapsed = WHITESPACE_RE.sub(' ', text)
    return collapsed
|
||||
|
||||
|
||||
def _squash_artifical_nl(parts):
|
||||
output, last_nl = [], False
|
||||
for x in parts:
|
||||
if x is not None:
|
||||
output.append(x)
|
||||
last_nl = False
|
||||
elif not last_nl:
|
||||
output.append(None)
|
||||
last_nl = True
|
||||
return output
|
||||
|
||||
|
||||
def _strip_artifical_nl(parts):
|
||||
if not parts:
|
||||
return parts
|
||||
for start_idx, pt in enumerate(parts):
|
||||
if isinstance(pt, str):
|
||||
# 0, 1, 2, index of first string [start_idx:...
|
||||
break
|
||||
iterator = enumerate(parts[:start_idx - 1 if start_idx > 0 else None:-1])
|
||||
for end_idx, pt in iterator:
|
||||
if isinstance(pt, str): # 0=None, 1=-1, 2=-2, index of last string
|
||||
break
|
||||
return parts[start_idx:-end_idx if end_idx > 0 else None]
|
||||
|
||||
|
||||
def _merge_original_parts(parts):
    """Merge adjacent text fragments and normalise their whitespace.

    Consecutive ``str`` items are concatenated, passed through
    ``squash_html_whitespace`` and stripped. A merged fragment that ends up
    empty is dropped. Non-text items (markers) are copied through in place
    and delimit the runs of text.

    Returns a new list.
    """
    merged = []
    pending = []  # text fragments collected since the last marker

    def emit_pending():
        # Turn the collected fragments into at most one output item.
        if not pending:
            return
        text = squash_html_whitespace(''.join(pending)).strip()
        del pending[:]
        if text:
            merged.append(text)

    for part in parts:
        if isinstance(part, str):
            pending.append(part)
            continue
        emit_pending()
        merged.append(part)

    emit_pending()
    return merged
|
||||
|
||||
|
||||
def extract_text_array(dom, squash_artifical_nl=True, strip_artifical_nl=True):
    """Flatten the text content of *dom* into a list of parts.

    Each item of the result is one of:

    * a ``str`` -- text taken from an element's ``text`` or a child's
      ``tail``;
    * ``None`` -- an artificially inserted newline, emitted before and after
      the content of every element whose tag is neither in ``INLINE_TAGS``
      nor in ``SEPARATORS``;
    * ``True`` -- a separator, emitted for elements whose tag is in
      ``SEPARATORS``.

    The tail of *dom* itself is not included; only the tails of its
    descendants are.

    :param dom: element exposing ``tag``, ``text``, ``tail`` and iteration
        over its children.
    :param squash_artifical_nl: collapse consecutive ``None`` markers in the
        final result.
    :param strip_artifical_nl: strip leading and trailing non-text items
        from the final result.
    :return: list of parts. Nodes with a callable ``tag`` (presumably
        comments and processing instructions, as in lxml -- confirm)
        contribute an empty list.
    """
    tag = dom.tag
    if callable(tag):
        # Always return a list so every code path has the same return type.
        return []

    is_separator = tag in SEPARATORS
    is_block = not is_separator and tag not in INLINE_TAGS

    r = []
    if is_separator:
        r.append(True)  # equivalent of '\n' used to designate separators
    elif is_block:
        # equivalent of '\n' used to designate artificially inserted newlines
        r.append(None)

    if dom.text is not None:
        r.append(dom.text)

    # Iterate the element directly: ``getchildren()`` is deprecated in lxml
    # and no longer exists on xml.etree.ElementTree elements.
    for child in dom:
        # Post-processing is applied once, on the complete top-level list.
        r.extend(extract_text_array(child, squash_artifical_nl=False,
                                    strip_artifical_nl=False))
        if child.tail is not None:
            r.append(child.tail)

    if is_block:
        # equivalent of '\n' used to designate artificially inserted newlines
        r.append(None)

    if squash_artifical_nl:
        r = _squash_artifical_nl(r)
    if strip_artifical_nl:
        r = _strip_artifical_nl(r)
    return r
|
||||
|
||||
|
||||
def extract_text(dom, block_symbol='\n', sep_symbol='\n', squash_space=True):
    """Return the text content of *dom* as a single string.

    The parts produced by ``extract_text_array`` are joined, with each
    ``None`` marker rendered as *block_symbol* and each ``True`` marker
    rendered as *sep_symbol*.

    :param dom: element to extract text from.
    :param block_symbol: string emitted for an artificial newline marker.
    :param sep_symbol: string emitted for a separator marker.
    :param squash_space: when true, adjacent text fragments are merged and
        whitespace-normalised, repeated markers are collapsed, leading and
        trailing markers are removed, and the final string is stripped.
    """
    parts = extract_text_array(dom, squash_artifical_nl=squash_space)
    if squash_space:
        parts = _merge_original_parts(parts)
        parts = _squash_artifical_nl(parts)
        parts = _strip_artifical_nl(parts)

    rendered = []
    for part in parts:
        if part is None:
            rendered.append(block_symbol)
        elif part is True:
            rendered.append(sep_symbol)
        else:
            rendered.append(part)

    text = ''.join(rendered)
    return text.strip() if squash_space else text
|
||||
Reference in New Issue
Block a user