Files
tmhr/venv/lib/python3.10/site-packages/pyquery/text.py
2024-02-23 10:30:02 +00:00

112 lines
3.3 KiB
Python

import re
# Tags treated as inline: extract_text_array() does not surround their
# content with artificial newline markers.  List taken from
# https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#Elements
INLINE_TAGS = {
    'a', 'abbr', 'acronym',
    'b', 'bdo', 'big', 'br', 'button',
    'cite', 'code',
    'dfn',
    'em',
    'i', 'img', 'input',
    'kbd',
    'label',
    'map',
    'object',
    'q',
    'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup',
    'textarea', 'time', 'tt',
    'var',
}

# Tags that emit an explicit separator marker instead of text.
SEPARATORS = {'br'}

# Definition of whitespace in HTML:
# https://www.w3.org/TR/html4/struct/text.html#h-9.1
# Characters, in order: space, tab, form feed, zero-width space, line feed,
# carriage return.
WHITESPACE_RE = re.compile(u'[\x20\x09\x0C\u200B\x0A\x0D]+')
def squash_html_whitespace(text):
    """Collapse every run of HTML whitespace in *text* into one space.

    Leading and trailing whitespace runs are reduced to a single space as
    well; they are not removed.

    Do not use this on preformatted content (such as ``<pre>`` content, or
    content whose whitespace is preserved by CSS rules); use the raw
    extracted text for that instead.
    """
    collapsed = WHITESPACE_RE.sub(' ', text)
    return collapsed
def _squash_artifical_nl(parts):
    """Collapse each run of consecutive ``None`` markers into a single one.

    ``None`` stands for an artificially inserted newline.  Every other item
    (text fragments, ``True`` separator markers) is copied through unchanged
    and ends the current run of markers.

    Returns a new list; *parts* is only iterated, never modified.
    """
    squashed = []
    previous_was_marker = False
    for part in parts:
        is_marker = part is None
        if is_marker and previous_was_marker:
            # Still inside a run of artificial newlines: keep only the first.
            continue
        squashed.append(part)
        previous_was_marker = is_marker
    return squashed
def _strip_artifical_nl(parts):
    """Trim non-string markers from both ends of *parts*.

    The result is the slice of *parts* running from the first string item
    to the last string item, inclusive.  Markers (``None`` / ``True``) that
    sit between two strings are kept.

    Special cases:

    * an empty *parts* is returned as is;
    * when *parts* contains no string at all, only its last element is
      kept.
    """
    if not parts:
        return parts

    last = len(parts) - 1

    # Index of the first string; falls back to the last position when the
    # sequence holds no string.
    first_str = next(
        (idx for idx, part in enumerate(parts) if isinstance(part, str)),
        last,
    )

    # Index of the last string, scanning backwards no further than the
    # first string; falls back to the last position when nothing is found.
    last_str = next(
        (idx for idx in range(last, first_str - 1, -1)
         if isinstance(parts[idx], str)),
        last,
    )

    return parts[first_str:last_str + 1]
def _merge_original_parts(parts):
    """Merge adjacent text fragments and normalise their whitespace.

    Each run of consecutive strings in *parts* is joined, passed through
    ``squash_html_whitespace`` and stripped.  The merged string is emitted
    only if something is left after stripping.  Non-string items (the
    ``None`` / ``True`` markers) are copied through unchanged and end the
    current run.

    Returns a new list.
    """
    merged = []
    pending = []  # consecutive text fragments not yet emitted

    def drain():
        # Emit the buffered run as one cleaned-up string, then reset it.
        if not pending:
            return
        text = squash_html_whitespace(''.join(pending)).strip()
        del pending[:]
        if text:
            merged.append(text)

    for part in parts:
        if isinstance(part, str):
            pending.append(part)
            continue
        # A marker ends the current text run.
        drain()
        merged.append(part)

    drain()
    return merged
def extract_text_array(dom, squash_artifical_nl=True, strip_artifical_nl=True):
    """Flatten *dom* into a list of text fragments and line-break markers.

    The returned list contains, in document order:

    * ``str`` items: the ``text`` of elements and the ``tail`` of their
      children, exactly as stored on the nodes;
    * ``None``: an artificially inserted newline, emitted before and after
      the content of every element whose tag is neither in ``INLINE_TAGS``
      nor in ``SEPARATORS``;
    * ``True``: an explicit separator, emitted for tags in ``SEPARATORS``.

    :param dom: element to walk; it must expose ``tag``, ``text`` and be
        iterable over its direct children, each of which exposes ``tail``.
    :param squash_artifical_nl: when true, collapse runs of ``None``
        markers with ``_squash_artifical_nl``.
    :param strip_artifical_nl: when true, trim leading/trailing markers
        with ``_strip_artifical_nl``.
    :returns: the list described above.  For a node whose ``tag`` is
        callable (e.g. lxml comments and processing instructions) an empty
        string is returned instead; this is kept for backward compatibility
        and behaves as an empty sequence when iterated or passed to
        ``list.extend``.
    """
    if callable(dom.tag):
        return ''

    is_separator = dom.tag in SEPARATORS
    # Block-level: anything that is neither inline nor a separator.
    is_block = dom.tag not in INLINE_TAGS and not is_separator

    r = []
    if is_separator:
        r.append(True)  # equivalent of '\n' used to designate separators
    elif is_block:
        # equivalent of '\n' used to designate artificially inserted newlines
        r.append(None)

    if dom.text is not None:
        r.append(dom.text)

    # Iterate the element directly rather than calling the deprecated
    # ``getchildren()``; both yield the direct children in document order.
    for child in dom:
        # Post-processing is disabled while recursing so that it is applied
        # only once, on the complete list, by the outermost call.
        r.extend(extract_text_array(child, squash_artifical_nl=False,
                                    strip_artifical_nl=False))
        if child.tail is not None:
            r.append(child.tail)

    if is_block:
        # equivalent of '\n' used to designate artificially inserted newlines
        r.append(None)

    if squash_artifical_nl:
        r = _squash_artifical_nl(r)
    if strip_artifical_nl:
        r = _strip_artifical_nl(r)
    return r
def extract_text(dom, block_symbol='\n', sep_symbol='\n', squash_space=True):
    """Return the text content of *dom* as a single string.

    :param dom: element passed on to ``extract_text_array``.
    :param block_symbol: string substituted for every artificial newline
        marker (``None``) in the extracted list.
    :param sep_symbol: string substituted for every separator marker
        (``True``) in the extracted list.
    :param squash_space: when true, adjacent text fragments are merged and
        their whitespace collapsed, repeated artificial newlines are
        squashed, leading/trailing markers are removed and the final string
        is stripped.  When false, the fragments are joined as extracted.
    :returns: the assembled text.
    """
    def render(item):
        # Translate a marker into its symbol; text passes through unchanged.
        if item is None:
            return block_symbol
        if item is True:
            return sep_symbol
        return item

    pieces = extract_text_array(dom, squash_artifical_nl=squash_space)
    if not squash_space:
        return ''.join(render(piece) for piece in pieces)

    pieces = _merge_original_parts(pieces)
    pieces = _squash_artifical_nl(pieces)
    pieces = _strip_artifical_nl(pieces)
    return ''.join(render(piece) for piece in pieces).strip()