Source code for ppc_robot_lib.utils.strings

import unicodedata
import re

from collections.abc import Iterable


# Fallback transliteration table: single characters mapped to plain ASCII replacements.
# It is consulted by ``remove_accents`` for every character that survives NFKD normalization,
# i.e. it is meant for letters (strokes, hooks, bars, ligatures, ...) that NFKD does not
# decompose into "base letter + combining mark".
#
# Every value must be a non-empty string: an empty replacement would silently delete
# the letter from the output instead of transliterating it.
NON_NFKD_MAP: dict[str, str] = {
    '\u0181': 'B',
    '\u1d81': 'd',
    '\u1d85': 'l',
    '\u1d89': 'r',
    '\u028b': 'v',
    '\u1d8d': 'x',
    '\u1d83': 'g',
    '\u0191': 'F',
    '\u0199': 'k',
    '\u019d': 'N',
    '\u0220': 'N',
    '\u01a5': 'p',
    '\u0224': 'Z',
    '\u0126': 'H',
    '\u01ad': 't',
    '\u01b5': 'Z',
    '\u0234': 'l',
    '\u023c': 'c',
    '\u0240': 'z',
    '\u0142': 'l',
    '\u0244': 'U',  # LATIN CAPITAL LETTER U BAR (was mapped to '' and therefore dropped)
    '\u2c60': 'L',
    '\u0248': 'J',
    '\ua74a': 'O',
    '\u024c': 'R',
    '\ua752': 'P',
    '\ua756': 'Q',
    '\ua75a': 'R',
    '\ua75e': 'V',
    '\u0260': 'g',
    '\u01e5': 'g',
    '\u2c64': 'R',
    '\u0166': 'T',
    '\u0268': 'i',
    '\u2c66': 't',
    '\u026c': 'l',
    '\u1d6e': 'f',
    '\u1d87': 'n',
    '\u1d72': 'r',
    '\u2c74': 'v',
    '\u1d76': 'z',
    '\u2c78': 'e',
    '\u027c': 'r',
    '\u1eff': 'y',
    '\ua741': 'k',
    '\u0182': 'B',
    '\u1d86': 'm',
    '\u0288': 't',
    '\u018a': 'D',
    '\u1d8e': 'z',
    '\u0111': 'd',
    '\u0290': 'z',
    '\u0192': 'f',
    '\u1d96': 'i',
    '\u019a': 'l',
    '\u019e': 'n',
    '\u1d88': 'p',
    '\u02a0': 'q',
    '\u01ae': 'T',
    '\u01b2': 'V',
    '\u01b6': 'z',
    '\u023b': 'C',
    '\u023f': 's',
    '\u0141': 'L',
    '\u0243': 'B',
    '\ua745': 'k',
    '\u0247': 'e',
    '\ua749': 'l',
    '\u024b': 'q',
    '\ua74d': 'o',
    '\u024f': 'y',
    '\ua751': 'p',
    '\u0253': 'b',
    '\ua755': 'p',
    '\u0257': 'd',
    '\ua759': 'q',
    '\xd8': 'O',
    '\u2c63': 'P',
    '\u2c67': 'H',
    '\u026b': 'l',
    '\u1d6d': 'd',
    '\u1d71': 'p',
    '\u0273': 'n',
    '\u1d75': 't',
    '\u1d91': 'd',
    '\xf8': 'o',
    '\u2c7e': 'S',
    '\u1d7d': 'p',
    '\u2c7f': 'Z',
    '\u0183': 'b',
    '\u0187': 'C',
    '\u1d80': 'b',
    '\u0289': 'u',  # LATIN SMALL LETTER U BAR (was mapped to '' and therefore dropped)
    '\u018b': 'D',
    '\u1d8f': 'a',
    '\u0291': 'z',
    '\u0110': 'D',
    '\u0193': 'G',
    '\u1d82': 'f',
    '\u0197': 'I',
    '\u029d': 'j',
    '\u019f': 'O',
    '\u2c6c': 'z',
    '\u01ab': 't',
    '\u01b3': 'Y',
    '\u0236': 't',
    '\u023a': 'A',
    '\u023e': 'T',
    '\ua740': 'K',
    '\u1d8a': 's',
    '\ua744': 'K',
    '\u0246': 'E',
    '\ua748': 'L',
    '\ua74c': 'O',
    '\u024e': 'Y',
    '\ua750': 'P',
    '\ua754': 'P',
    '\u0256': 'd',
    '\ua758': 'Q',
    '\u2c62': 'L',
    '\u0266': 'h',
    '\u2c73': 'w',
    '\u2c6a': 'k',
    '\u1d6c': 'b',
    '\u2c6e': 'M',
    '\u1d70': 'n',
    '\u0272': 'n',
    '\u1d92': 'e',
    '\u1d74': 's',
    '\u2c7a': 'o',
    '\u2c6b': 'Z',
    '\u027e': 'r',
    '\u0180': 'b',
    '\u0282': 's',
    '\u1d84': 'k',
    '\u0188': 'c',
    '\u018c': 'd',
    '\ua742': 'K',
    '\u1d99': 'u',  # LATIN SMALL LETTER U WITH RETROFLEX HOOK (was mapped to '' and dropped)
    '\u0198': 'K',
    '\u1d8c': 'v',
    '\u0221': 'd',
    '\u2c71': 'v',
    '\u0225': 'z',
    '\u01a4': 'P',
    '\u0127': 'h',
    '\u01ac': 'T',
    '\u0235': 'n',
    '\u01b4': 'y',
    '\u2c72': 'W',
    '\u023d': 'L',
    '\ua743': 'k',
    '\u0249': 'j',
    '\ua74b': 'o',
    '\u024d': 'r',
    '\ua753': 'p',
    '\u0255': 'c',
    '\ua757': 'q',
    '\u2c68': 'h',
    '\ua75b': 'r',
    '\ua75f': 'v',
    '\u2c61': 'l',
    '\u2c65': 'a',
    '\u01e4': 'G',
    '\u0167': 't',
    '\u2c69': 'K',
    '\u026d': 'l',
    '\u1d6f': 'm',
    '\u0271': 'm',
    '\u1d73': 'r',
    '\u027d': 'r',
    '\u1efe': 'Y',
    '\u1e9e': 'SS',
    '\u00df': 'ss',
    '\u00c6': 'AE',
    '\u00e6': 'ae',
}

# Matches runs of characters treated as emoji (or as glue inside emoji sequences).
#
# BMP emoji located above U+2BEF are listed one by one. A single wide range such as
# U+24C2..U+1F251 would also swallow CJK ideographs, kana, Hangul, fullwidth forms and
# every other script in between, deleting ordinary non-emoji text.
emoji = re.compile(
    '['
    '\U0001f600-\U0001f64f'  # emoticons
    '\U0001f300-\U0001f5ff'  # symbols & pictographs
    '\U0001f680-\U0001f6ff'  # transport & map symbols
    '\U0001f1e0-\U0001f1ff'  # regional indicators (flags)
    '\U0001f200-\U0001f251'  # enclosed ideographic supplement
    '\U0001f926-\U0001f937'
    '\U00010000-\U0010ffff'  # catch-all: every supplementary-plane code point
    '\U00002500-\U00002bef'  # box drawing .. miscellaneous symbols and arrows
    '\U00002702-\U000027b0'  # dingbats
    '\u2640-\u2642'
    '\u2600-\u2b55'
    '\u24c2'  # circled latin capital letter M
    '\u200d'  # zero width joiner
    '\u23cf'
    '\u23e9'
    '\u231a'
    '\ufe0f'  # variation selector-16
    '\u3030'  # wavy dash
    '\u303d'  # part alternation mark
    '\u3297'  # circled ideograph congratulation
    '\u3299'  # circled ideograph secret
    ']+',
    re.UNICODE,
)


def remove_accents(string: str) -> str:
    """
    Replace accented characters with their non-accented equivalents.

    Example::

        >>> remove_accents('Polynésie Française')
        'Polynesie Francaise'
        >>> remove_accents('Lech Wałęsa')
        'Lech Walesa'

    :param string: Input string.
    :return: String without accents.
    """
    # NFKD splits a character such as 'é' into its base letter followed by a combining mark.
    decomposed = unicodedata.normalize('NFKD', string)
    # Drop the combining marks (category "Mn"); anything else goes through the fallback
    # table for letters that normalization left untouched (e.g. 'ł').
    return ''.join(
        NON_NFKD_MAP.get(char, char)
        for char in decomposed
        if unicodedata.category(char) != 'Mn'
    )
def remove_emojis(text):
    """
    Remove emoji from ``text``.

    Every run of characters matched by the module-level ``emoji`` pattern is replaced
    with an empty string.

    :param text: Input string.
    :return: ``text`` with all matches of the ``emoji`` pattern removed.
    """
    return emoji.sub('', text)
[docs] class FirstMatchSplitter: """ Advanced first matched separator text splitter. **Usage**, see unit tests in :module:`test_first_match_splitter`. """ _separator_sanitizer_pattern = re.compile(r'\s+', flags=re.UNICODE) def __init__( self, separators: str | Iterable[str], sort_separators=True, remember_separator=False, filter_empty=True ): """ :param separators: single or multiple separators trying to match in a value. :param sort_separators: sort ``separators`` by length in reverse order to match separators in sentences with better equality as first. :param remember_separator: remember first matched separator to be used for every next :meth:`split` call. :param filter_empty: filter out empty elements on split (see :meth:`split`). """ if isinstance(separators, str): separators = [separators] # We filter out empty separators these are not allowed. separators = filter(bool, map(self._sanitize_separators, separators)) if sort_separators: separators = sorted(separators, key=len, reverse=True) self._separators = tuple(separators) if len(self._separators) == 0: raise ValueError('No valid separator specified.') self._remember_separator = remember_separator # First matched separator in a text (see :meth:`split`) if remember_separator is True. self._separator = None self._filter_empty = filter_empty @property def remember_separator(self): return self._remember_separator @remember_separator.setter def remember_separator(self, value: bool): self._remember_separator = value if value is False: self._separator = None
[docs] def split(self, text: str) -> Iterable[str]: """ Split ``text`` with FIRST MATCHED separator. :param text: source text to split with the first matched separators. :return: words iterator """ separator = self._match_separator(text) if separator is None: text = self._sanitize_text(text) return [] if not text else [text] parts = (self._sanitize_text(part, separator) for part in text.split(separator)) if self._filter_empty: parts = filter(bool, parts) return parts
def _match_separator(self, text: str) -> str | None: """ Find matching separator in a ``text``. """ if self._remember_separator and self._separator is not None: return self._separator for separator in self._separators: if separator in text: if self._remember_separator and self._separator is None: self._separator = separator return separator return None @staticmethod def _sanitize_text(text: str, separator: str | None = None): """ Remove trailing separator char and whitespaces. """ return text.strip('' if separator is None else separator).strip() def _sanitize_separators(self, separator: str): return self._separator_sanitizer_pattern.sub('', separator)