import unicodedata
import re
from collections.abc import Iterable
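
# Hand-curated map of characters that NFKD normalization cannot decompose into
# an ASCII base letter plus combining marks (stroked, hooked and barred letters,
# and ligatures such as Æ, ß and Ø). Used as a fallback by :func:`remove_accents`.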
NON_NFKD_MAP = {
'\u0181': 'B',
'\u1d81': 'd',
'\u1d85': 'l',
'\u1d89': 'r',
'\u028b': 'v',
'\u1d8d': 'x',
'\u1d83': 'g',
'\u0191': 'F',
'\u0199': 'k',
'\u019d': 'N',
'\u0220': 'N',
'\u01a5': 'p',
'\u0224': 'Z',
'\u0126': 'H',
'\u01ad': 't',
'\u01b5': 'Z',
'\u0234': 'l',
'\u023c': 'c',
'\u0240': 'z',
'\u0142': 'l',
    '\u0244': 'U',
'\u2c60': 'L',
'\u0248': 'J',
'\ua74a': 'O',
'\u024c': 'R',
'\ua752': 'P',
'\ua756': 'Q',
'\ua75a': 'R',
'\ua75e': 'V',
'\u0260': 'g',
'\u01e5': 'g',
'\u2c64': 'R',
'\u0166': 'T',
'\u0268': 'i',
'\u2c66': 't',
'\u026c': 'l',
'\u1d6e': 'f',
'\u1d87': 'n',
'\u1d72': 'r',
'\u2c74': 'v',
'\u1d76': 'z',
'\u2c78': 'e',
'\u027c': 'r',
'\u1eff': 'y',
'\ua741': 'k',
'\u0182': 'B',
'\u1d86': 'm',
'\u0288': 't',
'\u018a': 'D',
'\u1d8e': 'z',
'\u0111': 'd',
'\u0290': 'z',
'\u0192': 'f',
'\u1d96': 'i',
'\u019a': 'l',
'\u019e': 'n',
'\u1d88': 'p',
'\u02a0': 'q',
'\u01ae': 'T',
'\u01b2': 'V',
'\u01b6': 'z',
'\u023b': 'C',
'\u023f': 's',
'\u0141': 'L',
'\u0243': 'B',
'\ua745': 'k',
'\u0247': 'e',
'\ua749': 'l',
'\u024b': 'q',
'\ua74d': 'o',
'\u024f': 'y',
'\ua751': 'p',
'\u0253': 'b',
'\ua755': 'p',
'\u0257': 'd',
'\ua759': 'q',
'\xd8': 'O',
'\u2c63': 'P',
'\u2c67': 'H',
'\u026b': 'l',
'\u1d6d': 'd',
'\u1d71': 'p',
'\u0273': 'n',
'\u1d75': 't',
'\u1d91': 'd',
'\xf8': 'o',
'\u2c7e': 'S',
'\u1d7d': 'p',
'\u2c7f': 'Z',
'\u0183': 'b',
'\u0187': 'C',
'\u1d80': 'b',
    '\u0289': 'u',
'\u018b': 'D',
'\u1d8f': 'a',
'\u0291': 'z',
'\u0110': 'D',
'\u0193': 'G',
'\u1d82': 'f',
'\u0197': 'I',
'\u029d': 'j',
'\u019f': 'O',
'\u2c6c': 'z',
'\u01ab': 't',
'\u01b3': 'Y',
'\u0236': 't',
'\u023a': 'A',
'\u023e': 'T',
'\ua740': 'K',
'\u1d8a': 's',
'\ua744': 'K',
'\u0246': 'E',
'\ua748': 'L',
'\ua74c': 'O',
'\u024e': 'Y',
'\ua750': 'P',
'\ua754': 'P',
'\u0256': 'd',
'\ua758': 'Q',
'\u2c62': 'L',
'\u0266': 'h',
'\u2c73': 'w',
'\u2c6a': 'k',
'\u1d6c': 'b',
'\u2c6e': 'M',
'\u1d70': 'n',
'\u0272': 'n',
'\u1d92': 'e',
'\u1d74': 's',
'\u2c7a': 'o',
'\u2c6b': 'Z',
'\u027e': 'r',
'\u0180': 'b',
'\u0282': 's',
'\u1d84': 'k',
'\u0188': 'c',
'\u018c': 'd',
'\ua742': 'K',
    '\u1d99': 'u',
'\u0198': 'K',
'\u1d8c': 'v',
'\u0221': 'd',
'\u2c71': 'v',
'\u0225': 'z',
'\u01a4': 'P',
'\u0127': 'h',
'\u01ac': 'T',
'\u0235': 'n',
'\u01b4': 'y',
'\u2c72': 'W',
'\u023d': 'L',
'\ua743': 'k',
'\u0249': 'j',
'\ua74b': 'o',
'\u024d': 'r',
'\ua753': 'p',
'\u0255': 'c',
'\ua757': 'q',
'\u2c68': 'h',
'\ua75b': 'r',
'\ua75f': 'v',
'\u2c61': 'l',
'\u2c65': 'a',
'\u01e4': 'G',
'\u0167': 't',
'\u2c69': 'K',
'\u026d': 'l',
'\u1d6f': 'm',
'\u0271': 'm',
'\u1d73': 'r',
'\u027d': 'r',
'\u1efe': 'Y',
'\u1e9e': 'SS',
'\u00df': 'ss',
'\u00c6': 'AE',
'\u00e6': 'ae',
}
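
# Character class covering common emoji and pictographic code point ranges,
# used by :func:`remove_emojis`. Note that the U+10000-U+10FFFF range makes the
# pattern match any supplementary-plane character, not only emojis.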
emoji = re.compile(
'['
'\U0001f600-\U0001f64f' # emoticons
'\U0001f300-\U0001f5ff' # symbols & pictographs
'\U0001f680-\U0001f6ff' # transport & map symbols
'\U0001f1e0-\U0001f1ff' # flags (iOS)
    '\U00002500-\U00002bef' # box drawing, geometric shapes, misc. symbols
    '\U00002702-\U000027b0' # dingbats
    '\U000024c2-\U0001f251' # enclosed characters and other symbol blocks
    '\U0001f926-\U0001f937' # gesture emojis (face palm, shrug, ...)
    '\U00010000-\U0010ffff' # everything else in the supplementary planes (very broad)
    '\u2640-\u2642' # gender signs
    '\u2600-\u2b55' # misc. symbols
    '\u200d' # zero-width joiner
    '\u23cf' # eject symbol
    '\u23e9' # fast-forward symbol
    '\u231a' # watch
    '\ufe0f' # variation selector-16
    '\u3030' # wavy dash
']+',
re.UNICODE,
)
def remove_accents(string: str) -> str:
"""
Convert accented characters to their non.accented equivalents. Example::
>>> remove_accents('Polynésie Française')
'Polynesie Francaise'
>>> remove_accents('Lech Wałęsa')
'Lech Walesa'
:param string: Input string.
:return: String without accents.
"""
    # Decompose with NFKD and drop combining marks ('Mn'); characters that do
    # not decompose fall back to the hand-curated NON_NFKD_MAP.
    return ''.join(
        NON_NFKD_MAP.get(c, c)
        for c in unicodedata.normalize('NFKD', string)
        if unicodedata.category(c) != 'Mn'
    )
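
# Illustrative note: NFKD only handles characters that decompose into a base
# letter plus combining marks. Letters such as 'ø' and 'ß' have no such
# decomposition and are transliterated via NON_NFKD_MAP instead, e.g.
# (hypothetical input):
#
#     >>> remove_accents('Søren Müßig')
#     'Soren Mussig'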
def remove_emojis(text: str) -> str:
    """
    Remove emojis (and other matched pictographic symbols) from ``text``.

    :param text: Input string.
    :return: String with the matched symbols removed.
    """
    return emoji.sub('', text)
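
# Illustrative example, assuming the pattern defined above:
#
#     >>> remove_emojis('All good 👍🙂')
#     'All good '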
class FirstMatchSplitter:
"""
Advanced first matched separator text splitter.
**Usage**, see unit tests in :module:`test_first_match_splitter`.
"""
_separator_sanitizer_pattern = re.compile(r'\s+', flags=re.UNICODE)
def __init__(
        self, separators: str | Iterable[str], sort_separators: bool = True, remember_separator: bool = False, filter_empty: bool = True
):
"""
:param separators: single or multiple separators trying to match in a value.
:param sort_separators: sort ``separators`` by length in reverse order to match separators in sentences
with better equality as first.
:param remember_separator: remember first matched separator to be used for every next :meth:`split` call.
:param filter_empty: filter out empty elements on split (see :meth:`split`).
"""
if isinstance(separators, str):
separators = [separators]
        # Filter out empty separators; these are not allowed.
separators = filter(bool, map(self._sanitize_separators, separators))
if sort_separators:
separators = sorted(separators, key=len, reverse=True)
self._separators = tuple(separators)
if len(self._separators) == 0:
raise ValueError('No valid separator specified.')
self._remember_separator = remember_separator
# First matched separator in a text (see :meth:`split`) if remember_separator is True.
self._separator = None
self._filter_empty = filter_empty
@property
def remember_separator(self):
return self._remember_separator
@remember_separator.setter
def remember_separator(self, value: bool):
self._remember_separator = value
if value is False:
self._separator = None
def split(self, text: str) -> Iterable[str]:
"""
Split ``text`` with FIRST MATCHED separator.
:param text: source text to split with the first matched separators.
:return: words iterator
"""
separator = self._match_separator(text)
if separator is None:
text = self._sanitize_text(text)
return [] if not text else [text]
parts = (self._sanitize_text(part, separator) for part in text.split(separator))
if self._filter_empty:
parts = filter(bool, parts)
return parts
def _match_separator(self, text: str) -> str | None:
"""
Find matching separator in a ``text``.
"""
if self._remember_separator and self._separator is not None:
return self._separator
for separator in self._separators:
if separator in text:
if self._remember_separator and self._separator is None:
self._separator = separator
return separator
return None
@staticmethod
    def _sanitize_text(text: str, separator: str | None = None) -> str:
        """
        Strip separator characters (if any) and surrounding whitespace from both ends of ``text``.
        """
        return text.strip('' if separator is None else separator).strip()
    def _sanitize_separators(self, separator: str) -> str:
        # Remove all whitespace from the separator.
        return self._separator_sanitizer_pattern.sub('', separator)
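
# Minimal usage sketch (illustrative, not part of the test suite): the splitter
# picks the first separator found in the text and splits on that one only.
#
#     >>> splitter = FirstMatchSplitter([',', ';'])
#     >>> list(splitter.split('alpha, beta; gamma'))
#     ['alpha', 'beta; gamma']
#
# With ``remember_separator=True`` the first match is reused for later calls:
#
#     >>> s = FirstMatchSplitter([',', ';'], remember_separator=True)
#     >>> list(s.split('a;b'))
#     ['a', 'b']
#     >>> list(s.split('c,d'))  # ';' is remembered, so ',' is ignored
#     ['c,d']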