import unicodedata
import re
from collections.abc import Iterable
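
# Hand-curated map of characters that NFKD normalization cannot decompose into
# an ASCII base letter plus combining marks (stroked, hooked and barred letters,
# and ligatures such as Æ, ß and Ø). Used as a fallback by :func:`remove_accents`.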
NON_NFKD_MAP = {
'\u0181': 'B',
'\u1d81': 'd',
'\u1d85': 'l',
'\u1d89': 'r',
'\u028b': 'v',
'\u1d8d': 'x',
'\u1d83': 'g',
'\u0191': 'F',
'\u0199': 'k',
'\u019d': 'N',
'\u0220': 'N',
'\u01a5': 'p',
'\u0224': 'Z',
'\u0126': 'H',
'\u01ad': 't',
'\u01b5': 'Z',
'\u0234': 'l',
'\u023c': 'c',
'\u0240': 'z',
'\u0142': 'l',
    '\u0244': 'U',
'\u2c60': 'L',
'\u0248': 'J',
'\ua74a': 'O',
'\u024c': 'R',
'\ua752': 'P',
'\ua756': 'Q',
'\ua75a': 'R',
'\ua75e': 'V',
'\u0260': 'g',
'\u01e5': 'g',
'\u2c64': 'R',
'\u0166': 'T',
'\u0268': 'i',
'\u2c66': 't',
'\u026c': 'l',
'\u1d6e': 'f',
'\u1d87': 'n',
'\u1d72': 'r',
'\u2c74': 'v',
'\u1d76': 'z',
'\u2c78': 'e',
'\u027c': 'r',
'\u1eff': 'y',
'\ua741': 'k',
'\u0182': 'B',
'\u1d86': 'm',
'\u0288': 't',
'\u018a': 'D',
'\u1d8e': 'z',
'\u0111': 'd',
'\u0290': 'z',
'\u0192': 'f',
'\u1d96': 'i',
'\u019a': 'l',
'\u019e': 'n',
'\u1d88': 'p',
'\u02a0': 'q',
'\u01ae': 'T',
'\u01b2': 'V',
'\u01b6': 'z',
'\u023b': 'C',
'\u023f': 's',
'\u0141': 'L',
'\u0243': 'B',
'\ua745': 'k',
'\u0247': 'e',
'\ua749': 'l',
'\u024b': 'q',
'\ua74d': 'o',
'\u024f': 'y',
'\ua751': 'p',
'\u0253': 'b',
'\ua755': 'p',
'\u0257': 'd',
'\ua759': 'q',
'\xd8': 'O',
'\u2c63': 'P',
'\u2c67': 'H',
'\u026b': 'l',
'\u1d6d': 'd',
'\u1d71': 'p',
'\u0273': 'n',
'\u1d75': 't',
'\u1d91': 'd',
'\xf8': 'o',
'\u2c7e': 'S',
'\u1d7d': 'p',
'\u2c7f': 'Z',
'\u0183': 'b',
'\u0187': 'C',
'\u1d80': 'b',
    '\u0289': 'u',
'\u018b': 'D',
'\u1d8f': 'a',
'\u0291': 'z',
'\u0110': 'D',
'\u0193': 'G',
'\u1d82': 'f',
'\u0197': 'I',
'\u029d': 'j',
'\u019f': 'O',
'\u2c6c': 'z',
'\u01ab': 't',
'\u01b3': 'Y',
'\u0236': 't',
'\u023a': 'A',
'\u023e': 'T',
'\ua740': 'K',
'\u1d8a': 's',
'\ua744': 'K',
'\u0246': 'E',
'\ua748': 'L',
'\ua74c': 'O',
'\u024e': 'Y',
'\ua750': 'P',
'\ua754': 'P',
'\u0256': 'd',
'\ua758': 'Q',
'\u2c62': 'L',
'\u0266': 'h',
'\u2c73': 'w',
'\u2c6a': 'k',
'\u1d6c': 'b',
'\u2c6e': 'M',
'\u1d70': 'n',
'\u0272': 'n',
'\u1d92': 'e',
'\u1d74': 's',
'\u2c7a': 'o',
'\u2c6b': 'Z',
'\u027e': 'r',
'\u0180': 'b',
'\u0282': 's',
'\u1d84': 'k',
'\u0188': 'c',
'\u018c': 'd',
'\ua742': 'K',
    '\u1d99': 'u',
'\u0198': 'K',
'\u1d8c': 'v',
'\u0221': 'd',
'\u2c71': 'v',
'\u0225': 'z',
'\u01a4': 'P',
'\u0127': 'h',
'\u01ac': 'T',
'\u0235': 'n',
'\u01b4': 'y',
'\u2c72': 'W',
'\u023d': 'L',
'\ua743': 'k',
'\u0249': 'j',
'\ua74b': 'o',
'\u024d': 'r',
'\ua753': 'p',
'\u0255': 'c',
'\ua757': 'q',
'\u2c68': 'h',
'\ua75b': 'r',
'\ua75f': 'v',
'\u2c61': 'l',
'\u2c65': 'a',
'\u01e4': 'G',
'\u0167': 't',
'\u2c69': 'K',
'\u026d': 'l',
'\u1d6f': 'm',
'\u0271': 'm',
'\u1d73': 'r',
'\u027d': 'r',
'\u1efe': 'Y',
'\u1e9e': 'SS',
'\u00df': 'ss',
'\u00c6': 'AE',
'\u00e6': 'ae',
}
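
# Character class covering common emoji and pictographic code point ranges,
# used by :func:`remove_emojis`. Note that the U+10000-U+10FFFF range makes the
# pattern match any supplementary-plane character, not only emojis.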
emoji = re.compile(
'['
'\U0001f600-\U0001f64f' # emoticons
'\U0001f300-\U0001f5ff' # symbols & pictographs
'\U0001f680-\U0001f6ff' # transport & map symbols
'\U0001f1e0-\U0001f1ff' # flags (iOS)
    '\U00002500-\U00002bef' # box drawing, geometric shapes, misc. symbols
    '\U00002702-\U000027b0' # dingbats
    '\U000024c2-\U0001f251' # enclosed characters and other symbol blocks
    '\U0001f926-\U0001f937' # gesture emojis (face palm, shrug, ...)
    '\U00010000-\U0010ffff' # everything else in the supplementary planes (very broad)
    '\u2640-\u2642' # gender signs
    '\u2600-\u2b55' # misc. symbols
    '\u200d' # zero-width joiner
    '\u23cf' # eject symbol
    '\u23e9' # fast-forward symbol
    '\u231a' # watch
    '\ufe0f' # variation selector-16
    '\u3030' # wavy dash
']+',
re.UNICODE,
)
def remove_accents(string: str) -> str:
"""
Convert accented characters to their non.accented equivalents. Example::
>>> remove_accents('Polynésie Française')
'Polynesie Francaise'
>>> remove_accents('Lech Wałęsa')
'Lech Walesa'
:param string: Input string.
:return: String without accents.
"""
    # Decompose with NFKD and drop combining marks ('Mn'); characters that do
    # not decompose fall back to the hand-curated NON_NFKD_MAP.
    return ''.join(
        NON_NFKD_MAP.get(c, c)
        for c in unicodedata.normalize('NFKD', string)
        if unicodedata.category(c) != 'Mn'
    )
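
# Illustrative note: NFKD only handles characters that decompose into a base
# letter plus combining marks. Letters such as 'ø' and 'ß' have no such
# decomposition and are transliterated via NON_NFKD_MAP instead, e.g.
# (hypothetical input):
#
#     >>> remove_accents('Søren Müßig')
#     'Soren Mussig'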
def remove_emojis(text: str) -> str:
    """
    Remove emojis (and other matched pictographic symbols) from ``text``.

    :param text: Input string.
    :return: String with the matched symbols removed.
    """
    return emoji.sub('', text)
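
# Illustrative example, assuming the pattern defined above:
#
#     >>> remove_emojis('All good 👍🙂')
#     'All good '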
class FirstMatchSplitter:
"""
Advanced first matched separator text splitter.
**Usage**, see unit tests in :module:`test_first_match_splitter`.
"""
_separator_sanitizer_pattern = re.compile(r'\s+', flags=re.UNICODE)
def __init__(
        self, separators: str | Iterable[str], sort_separators: bool = True, remember_separator: bool = False, filter_empty: bool = True
):
"""
:param separators: single or multiple separators trying to match in a value.
:param sort_separators: sort ``separators`` by length in reverse order to match separators in sentences
with better equality as first.
:param remember_separator: remember first matched separator to be used for every next :meth:`split` call.
:param filter_empty: filter out empty elements on split (see :meth:`split`).
"""
if isinstance(separators, str):
separators = [separators]
        # Filter out empty separators; these are not allowed.
separators = filter(bool, map(self._sanitize_separators, separators))
if sort_separators:
separators = sorted(separators, key=len, reverse=True)
self._separators = tuple(separators)
if len(self._separators) == 0:
raise ValueError('No valid separator specified.')
self._remember_separator = remember_separator
# First matched separator in a text (see :meth:`split`) if remember_separator is True.
self._separator = None
self._filter_empty = filter_empty
@property
def remember_separator(self):
return self._remember_separator
@remember_separator.setter
def remember_separator(self, value: bool):
self._remember_separator = value
if value is False:
self._separator = None
def split(self, text: str) -> Iterable[str]:
"""
Split ``text`` with FIRST MATCHED separator.
:param text: source text to split with the first matched separators.
:return: words iterator
"""
separator = self._match_separator(text)
if separator is None:
text = self._sanitize_text(text)
return [] if not text else [text]
parts = (self._sanitize_text(part, separator) for part in text.split(separator))
if self._filter_empty:
parts = filter(bool, parts)
return parts
def _match_separator(self, text: str) -> str | None:
"""
Find matching separator in a ``text``.
"""
if self._remember_separator and self._separator is not None:
return self._separator
for separator in self._separators:
if separator in text:
if self._remember_separator and self._separator is None:
self._separator = separator
return separator
return None
@staticmethod
    def _sanitize_text(text: str, separator: str | None = None) -> str:
        """
        Strip separator characters (if any) and surrounding whitespace from both ends of ``text``.
        """
        return text.strip('' if separator is None else separator).strip()
    def _sanitize_separators(self, separator: str) -> str:
        # Remove all whitespace from the separator.
        return self._separator_sanitizer_pattern.sub('', separator)
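
# Minimal usage sketch (illustrative, not part of the test suite): the splitter
# picks the first separator found in the text and splits on that one only.
#
#     >>> splitter = FirstMatchSplitter([',', ';'])
#     >>> list(splitter.split('alpha, beta; gamma'))
#     ['alpha', 'beta; gamma']
#
# With ``remember_separator=True`` the first match is reused for later calls:
#
#     >>> s = FirstMatchSplitter([',', ';'], remember_separator=True)
#     >>> list(s.split('a;b'))
#     ['a', 'b']
#     >>> list(s.split('c,d'))  # ';' is remembered, so ',' is ignored
#     ['c,d']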