tmhr/venv/lib/python3.10/site-packages/pyquery/text.py

import re


# https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#Elements
INLINE_TAGS = {
    'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite',
    'code', 'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map',
    'object', 'q', 'samp', 'script', 'select', 'small', 'span', 'strong',
    'sub', 'sup', 'textarea', 'time', 'tt', 'var'
}

SEPARATORS = {'br'}


# Definition of whitespace in HTML:
# https://www.w3.org/TR/html4/struct/text.html#h-9.1
WHITESPACE_RE = re.compile(u'[\x20\x09\x0C\u200B\x0A\x0D]+')


def squash_html_whitespace(text):
    # use raw extract_text for preformatted content (like <pre> content or set
    # by CSS rules)
    # apply this function on top of
    return WHITESPACE_RE.sub(' ', text)


def _squash_artifical_nl(parts):
    output, last_nl = [], False
    for x in parts:
        if x is not None:
            output.append(x)
            last_nl = False
        elif not last_nl:
            output.append(None)
            last_nl = True
    return output


def _strip_artifical_nl(parts):
    if not parts:
        return parts
    for start_idx, pt in enumerate(parts):
        if isinstance(pt, str):
            # 0, 1, 2, index of first string [start_idx:...
            break
    iterator = enumerate(parts[:start_idx - 1 if start_idx > 0 else None:-1])
    for end_idx, pt in iterator:
        if isinstance(pt, str):  # 0=None, 1=-1, 2=-2, index of last string
            break
    return parts[start_idx:-end_idx if end_idx > 0 else None]


def _merge_original_parts(parts):
    output, orp_buf = [], []

    def flush():
        if orp_buf:
            item = squash_html_whitespace(''.join(orp_buf)).strip()
            if item:
                output.append(item)
            orp_buf[:] = []

    for x in parts:
        if not isinstance(x, str):
            flush()
            output.append(x)
        else:
            orp_buf.append(x)
    flush()
    return output


def extract_text_array(dom, squash_artifical_nl=True, strip_artifical_nl=True):
    if callable(dom.tag):
        return ''
    r = []
    if dom.tag in SEPARATORS:
        r.append(True)  # equivalent of '\n' used to designate separators
    elif dom.tag not in INLINE_TAGS:
        # equivalent of '\n' used to designate artificially inserted newlines
        r.append(None)
    if dom.text is not None:
        r.append(dom.text)
    for child in dom.getchildren():
        r.extend(extract_text_array(child, squash_artifical_nl=False,
                                    strip_artifical_nl=False))
        if child.tail is not None:
            r.append(child.tail)
    if dom.tag not in INLINE_TAGS and dom.tag not in SEPARATORS:
        # equivalent of '\n' used to designate artificially inserted newlines
        r.append(None)
    if squash_artifical_nl:
        r = _squash_artifical_nl(r)
    if strip_artifical_nl:
        r = _strip_artifical_nl(r)
    return r


def extract_text(dom, block_symbol='\n', sep_symbol='\n', squash_space=True):
    a = extract_text_array(dom, squash_artifical_nl=squash_space)
    if squash_space:
        a = _strip_artifical_nl(_squash_artifical_nl(_merge_original_parts(a)))
    result = ''.join(
        block_symbol if x is None else (
            sep_symbol if x is True else x
        )
        for x in a
    )
    if squash_space:
        result = result.strip()
    return result