Python unicodedata.combining() Examples

The following are 30 code examples of unicodedata.combining(). You can go to the original project or source file by following the link above each example. You may also want to check out all available functions/classes of the module unicodedata, or try the search function.
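Before the project examples, here is a minimal illustration of what unicodedata.combining() itself returns: the canonical combining class of a single character, which is 0 for ordinary characters and non-zero for combining marks.

import unicodedata

print(unicodedata.combining('a'))       # 0   -- ordinary letter, not a combining mark
print(unicodedata.combining('\u0301'))  # 230 -- COMBINING ACUTE ACCENT
print(unicodedata.combining('\u4e2d'))  # 0   -- CJK ideograph, not a combining mark

Many of the examples below use this as a filter: after NFKD normalization, accents become separate combining characters that combining() can detect and drop.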
Example #1
Source File: text.py    From bioforum with MIT License
def _text_chars(self, length, truncate, text, truncate_len):
        """Truncate a string after a certain number of chars."""
        s_len = 0
        end_index = None
        for i, char in enumerate(text):
            if unicodedata.combining(char):
                # Don't consider combining characters
                # as adding to the string length
                continue
            s_len += 1
            if end_index is None and s_len > truncate_len:
                end_index = i
            if s_len > length:
                # Return the truncated string
                return self.add_truncation_text(text[:end_index or 0],
                                                truncate)

        # Return the original string since no truncation was necessary
        return text 
Example #2
Source File: STFIWF.py    From 2016CCF-sougou with Apache License 2.0
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    if normalized == s:
        return s
    else:
        return ''.join([c for c in normalized if not unicodedata.combining(c)]) 
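A hypothetical call, assuming the function above is in scope (it is not part of the original project file), showing the effect of the NFKD decomposition plus combining() filter:

print(strip_accents_unicode(u'Málaga café'))  # -> 'Malaga cafe' (accents are dropped)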
Example #3
Source File: cpp_lint.py    From Deep-Exemplar-based-Colorization with MIT License
def GetLineWidth(line):
  """Determines the width of the line in column positions.

  Args:
    line: A string, which may be a Unicode string.

  Returns:
    The width of the line in column positions, accounting for Unicode
    combining characters and wide characters.
  """
  if isinstance(line, unicode):
    width = 0
    for uc in unicodedata.normalize('NFC', line):
      if unicodedata.east_asian_width(uc) in ('W', 'F'):
        width += 2
      elif not unicodedata.combining(uc):
        width += 1
    return width
  else:
    return len(line) 
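GetLineWidth() above is Python 2 code (note the isinstance(line, unicode) check). A rough Python 3 sketch of the same width rule -- wide/fullwidth characters count as two columns, combining characters as zero -- might look like this:

import unicodedata

def get_line_width(line):
    """Column width: wide/fullwidth chars count 2, combining chars count 0."""
    width = 0
    for uc in unicodedata.normalize('NFC', line):
        if unicodedata.east_asian_width(uc) in ('W', 'F'):
            width += 2
        elif not unicodedata.combining(uc):
            width += 1
    return width

print(get_line_width('abc'))         # 3
print(get_line_width('中文'))         # 4 (two wide characters)
print(get_line_width('cafe\u0301'))  # 4 (the combining accent adds no extra column)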
Example #4
Source File: albert_tokenization.py    From bert-for-tf2 with MIT License
def preprocess_text(inputs, remove_space=True, lower=False):
    """preprocess data by removing extra space and normalize data."""
    outputs = inputs
    if remove_space:
        outputs = " ".join(inputs.strip().split())

    if six.PY2 and isinstance(outputs, str):
        try:
            outputs = six.ensure_text(outputs, "utf-8")
        except UnicodeDecodeError:
            outputs = six.ensure_text(outputs, "latin-1")

    outputs = unicodedata.normalize("NFKD", outputs)
    outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
    if lower:
        outputs = outputs.lower()

    return outputs 
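A hypothetical call, assuming preprocess_text() above is importable (it is not part of the original project file); under Python 3 the six branch is skipped, so the call reduces to whitespace collapsing, NFKD accent stripping and optional lowercasing:

print(preprocess_text("  Héllo   Wörld  ", lower=True))  # -> 'hello world'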
Example #5
Source File: text.py    From GTDWeb with GNU General Public License v2.0
def chars(self, num, truncate=None, html=False):
        """
        Returns the text truncated to be no longer than the specified number
        of characters.

        Takes an optional argument of what should be used to notify that the
        string has been truncated, defaulting to a translatable string of an
        ellipsis (...).
        """
        length = int(num)
        text = unicodedata.normalize('NFC', self._wrapped)

        # Calculate the length to truncate to (max length - end_text length)
        truncate_len = length
        for char in self.add_truncation_text('', truncate):
            if not unicodedata.combining(char):
                truncate_len -= 1
                if truncate_len == 0:
                    break
        if html:
            return self._truncate_html(length, truncate, text, truncate_len, False)
        return self._text_chars(length, truncate, text, truncate_len) 
Example #6
Source File: text.py    From GTDWeb with GNU General Public License v2.0
def _text_chars(self, length, truncate, text, truncate_len):
        """
        Truncates a string after a certain number of chars.
        """
        s_len = 0
        end_index = None
        for i, char in enumerate(text):
            if unicodedata.combining(char):
                # Don't consider combining characters
                # as adding to the string length
                continue
            s_len += 1
            if end_index is None and s_len > truncate_len:
                end_index = i
            if s_len > length:
                # Return the truncated string
                return self.add_truncation_text(text[:end_index or 0],
                                                truncate)

        # Return the original string since no truncation was necessary
        return text 
Example #7
Source File: test_regressions.py    From ironpython2 with Apache License 2.0
def test_ipy2_gh357(self):
        """https://github.com/IronLanguages/ironpython2/issues/357"""

        import unicodedata

        if is_cli:
            self.assertEqual(unicodedata.name(u'\u4e2d'), '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>')
        else:
            self.assertEqual(unicodedata.name(u'\u4e2d'), 'CJK UNIFIED IDEOGRAPH-4E2D')

        self.assertRaises(ValueError, unicodedata.decimal, u'\u4e2d')
        self.assertEqual(unicodedata.decimal(u'\u4e2d', 0), 0)
        self.assertRaises(ValueError, unicodedata.digit, u'\u4e2d')
        self.assertEqual(unicodedata.digit(u'\u4e2d', 0), 0)
        self.assertRaises(ValueError, unicodedata.numeric, u'\u4e2d')
        self.assertEqual(unicodedata.numeric(u'\u4e2d', 0), 0)
        self.assertEqual(unicodedata.category(u'\u4e2d'), 'Lo')
        self.assertEqual(unicodedata.bidirectional(u'\u4e2d'), 'L')
        self.assertEqual(unicodedata.combining(u'\u4e2d'), 0)
        self.assertEqual(unicodedata.east_asian_width(u'\u4e2d'), 'W')
        self.assertEqual(unicodedata.mirrored(u'\u4e2d'), 0)
        self.assertEqual(unicodedata.decomposition(u'\u4e2d'), '') 
Example #8
Source File: text.py    From Mastering-Elasticsearch-7.0 with MIT License
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    Parameters
    ----------
    s : string
        The string to strip

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    normalized = unicodedata.normalize('NFKD', s)
    if normalized == s:
        return s
    else:
        return ''.join([c for c in normalized if not unicodedata.combining(c)]) 
Example #9
Source File: prepro_utils.py    From XLnet-gen with MIT License
def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False):
  if remove_space:
    outputs = ' '.join(inputs.strip().split())
  else:
    outputs = inputs
  outputs = outputs.replace("``", '"').replace("''", '"')

  if six.PY2 and isinstance(outputs, str):
    outputs = outputs.decode('utf-8')

  if not keep_accents:
    outputs = unicodedata.normalize('NFKD', outputs)
    outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
  if lower:
    outputs = outputs.lower()

  return outputs 
Example #10
Source File: text.py    From bioforum with MIT License
def chars(self, num, truncate=None, html=False):
        """
        Return the text truncated to be no longer than the specified number
        of characters.

        `truncate` specifies what should be used to notify that the string has
        been truncated, defaulting to a translatable string of an ellipsis
        (...).
        """
        self._setup()
        length = int(num)
        text = unicodedata.normalize('NFC', self._wrapped)

        # Calculate the length to truncate to (max length - end_text length)
        truncate_len = length
        for char in self.add_truncation_text('', truncate):
            if not unicodedata.combining(char):
                truncate_len -= 1
                if truncate_len == 0:
                    break
        if html:
            return self._truncate_html(length, truncate, text, truncate_len, False)
        return self._text_chars(length, truncate, text, truncate_len) 
Example #11
Source File: texttable.py    From deepWordBug with Apache License 2.0
def len(iterable):
    """Redefining len here so it will be able to work with non-ASCII characters
    """
    if isinstance(iterable, bytes_type) or isinstance(iterable, unicode_type):
        unicode_data = obj2unicode(iterable)
        if hasattr(unicodedata, 'east_asian_width'):
            w = unicodedata.east_asian_width
            return sum([w(c) in 'WF' and 2 or 0 if unicodedata.combining(c) else 1 for c in unicode_data])
        else:
            return unicode_data.__len__()
    else:
        return iterable.__len__() 
Example #12
Source File: core.py    From pex with Apache License 2.0
def check_initial_combiner(label):

    if unicodedata.category(label[0])[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True 
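For context (an illustration, not taken from the idna source): combining marks have a Unicode general category starting with 'M', which is exactly what the check above rejects at the start of a label.

import unicodedata

print(unicodedata.category('\u0301'))  # 'Mn' -- a combining mark, illegal as the first character of a label
print(unicodedata.category('a'))       # 'Ll' -- an ordinary letter, allowed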
Example #13
Source File: core.py    From pex with Apache License 2.0
def _combining_class(cp):
    v = unicodedata.combining(unichr(cp))
    if v == 0:
        if not unicodedata.name(unichr(cp)):
            raise ValueError("Unknown character in unicodedata")
    return v 
Example #14
Source File: __init__.py    From deepWordBug with Apache License 2.0
def column_indices(text):
    """Indices of Unicode string `text` when skipping combining characters.

    >>> from docutils.utils import column_indices
    >>> column_indices(u'A t̆ab̆lĕ')
    [0, 1, 2, 4, 5, 7, 8]

    """
    # TODO: account for asian wide chars here instead of using dummy
    # replacements in the tableparser?
    string_indices = list(range(len(text)))
    for index in find_combining_chars(text):
        string_indices[index] = None
    return [i for i in string_indices if i is not None] 
Example #15
Source File: core.py    From Mastering-Elasticsearch-7.0 with MIT License
def check_initial_combiner(label):

    if unicodedata.category(label[0])[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True 
Example #16
Source File: core.py    From Mastering-Elasticsearch-7.0 with MIT License
def _combining_class(cp):
    v = unicodedata.combining(unichr(cp))
    if v == 0:
        if not unicodedata.name(unichr(cp)):
            raise ValueError("Unknown character in unicodedata")
    return v 
Example #17
Source File: plugin.py    From limnoria-plugins with Do What The F*ck You Want To Public License
def removeAccents(self, text):
        text = str(text)
        normalized = unicodedata.normalize("NFKD", text)
        normalized = "".join([c for c in normalized if not unicodedata.combining(c)])
        return normalized 
Example #18
Source File: mnemonic.py    From torba with MIT License
def normalize_text(seed):
    seed = unicodedata.normalize('NFKD', seed)
    seed = seed.lower()
    # remove accents
    seed = u''.join([c for c in seed if not unicodedata.combining(c)])
    # normalize whitespaces
    seed = u' '.join(seed.split())
    # remove whitespaces between CJK
    seed = u''.join([
        seed[i] for i in range(len(seed))
        if not (seed[i] in string.whitespace and is_cjk(seed[i-1]) and is_cjk(seed[i+1]))
    ])
    return seed 
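A hypothetical usage of normalize_text() above (it assumes the module's string import and is_cjk() helper are in scope), showing lowercasing, accent removal and whitespace collapsing in one pass:

print(normalize_text('  CRÈME   brûlée  '))  # -> 'creme brulee'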
Example #19
Source File: core.py    From chinese-support-redux with GNU General Public License v3.0
def check_initial_combiner(label):

    if unicodedata.category(label[0])[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True 
Example #20
Source File: core.py    From chinese-support-redux with GNU General Public License v3.0
def _combining_class(cp):
    v = unicodedata.combining(unichr(cp))
    if v == 0:
        if not unicodedata.name(unichr(cp)):
            raise ValueError("Unknown character in unicodedata")
    return v 
Example #21
Source File: utils.py    From verejne.digital with Apache License 2.0
def remove_accents(s):
    s_NFKD = unicodedata.normalize('NFKD', s)
    return ''.join([c for c in s_NFKD if not unicodedata.combining(c)]) 
Example #22
Source File: utils.py    From verejne.digital with Apache License 2.0
def remove_accents(s):
    s_NFKD = unicodedata.normalize('NFKD', s)
    return u''.join([c for c in s_NFKD if not unicodedata.combining(c)]) 
Example #23
Source File: __init__.py    From deepWordBug with Apache License 2.0
def column_width(text):
    """Return the column width of text.

    Correct ``len(text)`` for wide East Asian and combining Unicode chars.
    """
    if isinstance(text, str) and sys.version_info < (3,0):
        return len(text)
    try:
        width = sum([east_asian_widths[unicodedata.east_asian_width(c)]
                     for c in text])
    except AttributeError:  # east_asian_width() New in version 2.4.
        width = len(text)
    # correction for combining chars:
    width -= len(find_combining_chars(text))
    return width 
Example #24
Source File: core.py    From oss-ftp with MIT License
def check_initial_combiner(label):

    if unicodedata.category(label[0])[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True 
Example #25
Source File: core.py    From oss-ftp with MIT License
def _combining_class(cp):
    return unicodedata.combining(unichr(cp)) 
Example #26
Source File: core.py    From wow-addon-updater with GNU General Public License v3.0
def check_initial_combiner(label):

    if unicodedata.category(label[0])[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True 
Example #27
Source File: core.py    From wow-addon-updater with GNU General Public License v3.0
def _combining_class(cp):
    return unicodedata.combining(unichr(cp)) 
Example #28
Source File: core.py    From kahoot-hack with GNU General Public License v3.0
def _combining_class(cp):
    return unicodedata.combining(unichr(cp)) 
Example #29
Source File: core.py    From kahoot-hack with GNU General Public License v3.0
def check_initial_combiner(label):

    if unicodedata.category(label[0])[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True 
Example #30
Source File: core.py    From Mastering-Elasticsearch-7.0 with MIT License
def _combining_class(cp):
    v = unicodedata.combining(unichr(cp))
    if v == 0:
        if not unicodedata.name(unichr(cp)):
            raise ValueError("Unknown character in unicodedata")
    return v