Python unicodedata.lookup() Examples

The following are 30 code examples of unicodedata.lookup(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module unicodedata, or try the search function.
Example #1
Source File: paraparser.py    From stdm with GNU General Public License v2.0 6 votes vote down vote up
def start_unichar(self, attr):
        """Handle a self-closing ``<unichar/>`` tag.

        The character is selected either by its Unicode name (``name``
        attribute) or by an expression giving its code point (``code``
        attribute).  The UTF-8 encoded character is handed to
        ``self.handle_data``.  Problems are reported through
        ``self._syntax_error`` and a NUL character is emitted in place of a
        character that could not be resolved.

        attr -- mapping of attribute names to attribute values.
        """
        if 'name' in attr:
            if 'code' in attr:
                self._syntax_error('<unichar/> invalid with both name and code attributes')
            try:
                v = unicodedata.lookup(attr['name']).encode('utf8')
            except KeyError:
                # Quote the offending attribute value in the error message.
                self._syntax_error('<unichar/> invalid name attribute\n"%s"' % attr['name'])
                v = '\0'
        elif 'code' in attr:
            try:
                # SECURITY NOTE(review): eval() runs arbitrary expressions taken
                # from the markup; only feed this parser trusted input.
                v = unichr(int(eval(attr['code']))).encode('utf8')
            except Exception:
                self._syntax_error('<unichar/> invalid code attribute %s' % attr['code'])
                v = '\0'
        else:
            v = None
            if attr:
                # list() keeps this working when keys() returns a view object.
                self._syntax_error('<unichar/> invalid attribute %s' % list(attr)[0])

        if v is not None:
            self.handle_data(v)
        self._push(_selfClosingTag='unichar')
Example #2
Source File: python.py    From pyspelling with MIT License 6 votes vote down vote up
def replace_unicode(self, m):
        """Translate one matched escape sequence into its replacement text.

        ``m`` is a regex match whose named groups (``fesc``, ``format``,
        ``special``, ``char``, ``oct``, ``name``) tell which kind of escape
        was found.  NUL characters in the result become newlines.
        """

        kinds = m.groupdict()
        matched = m.group(0)

        def or_verbatim(convert):
            # Fall back to the raw matched text when conversion fails.
            try:
                return convert()
            except Exception:
                return matched

        if kinds.get('fesc'):
            result = m.group(0)
        elif kinds.get('format'):
            result = ' '
        elif kinds.get('special'):
            result = BACK_SLASH_TRANSLATION[matched]
        elif kinds.get('char'):
            # Hexadecimal digits follow a two-character prefix.
            result = or_verbatim(lambda: chr(int(matched[2:], 16)))
        elif kinds.get('oct'):
            # Octal digits follow the backslash.
            result = chr(int(matched[1:], 8))
        elif kinds.get('name'):
            # The character name sits between a three-character prefix and
            # the final character of the match.
            result = or_verbatim(lambda: unicodedata.lookup(matched[3:-1]))
        return result.replace('\x00', '\n')
Example #3
Source File: paraparser.py    From Fluid-Designer with GNU General Public License v3.0 6 votes vote down vote up
def start_unichar(self, attr):
        """Handle a self-closing ``<unichar/>`` tag.

        The character comes from the ``name`` attribute (a Unicode character
        name) or from the ``code`` attribute (an expression evaluating to a
        code point).  The character is passed to ``self.handle_data``;
        problems are reported via ``self._syntax_error`` and a NUL character
        is emitted instead.

        attr -- mapping of attribute names to attribute values.
        """
        has_name = 'name' in attr
        has_code = 'code' in attr
        text = None

        if has_name:
            if has_code:
                self._syntax_error('<unichar/> invalid with both name and code attributes')
            try:
                text = unicodedata.lookup(attr['name'])
            except KeyError:
                self._syntax_error('<unichar/> invalid name attribute\n"%s"' % ascii(attr['name']))
                text = '\0'
        elif has_code:
            try:
                # NOTE(review): eval() executes whatever expression the markup
                # supplies; only trusted input should reach this point.
                codepoint = int(eval(attr['code']))
                text = chr(codepoint) if isPy3 else unichr(codepoint)
            except:
                self._syntax_error('<unichar/> invalid code attribute %s' % ascii(attr['code']))
                text = '\0'
        elif attr:
            # Neither supported attribute is present, yet something was given.
            self._syntax_error('<unichar/> invalid attribute %s' % list(attr.keys())[0])

        if text is not None:
            self.handle_data(text)
        self._push('unichar',_selfClosingTag='unichar')
Example #4
Source File: utilities.py    From particle with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def greek_letter_name_to_unicode(letter):
    # type: (str) -> str
    """
    Return a greek letter name as a Unicode character.

    An all-lowercase name selects the small letter, anything else the
    capital letter.  The Unicode database spells the eleventh letter
    "lamda"; the common spelling "lambda" is accepted as well.

    Raises KeyError when the name is not a Greek letter known to
    ``unicodedata``.

    Examples
    --------
    Lamda -> Λ
    lambda -> λ
    Omega -> Ω
    omega -> ω
    """
    case = "SMALL" if letter == letter.lower() else "CAPITAL"
    name = letter.upper()
    if name == "LAMBDA":
        # Unicode character names use the spelling LAMDA.
        name = "LAMDA"
    return unicodedata.lookup(
        "GREEK {case} LETTER {name}".format(case=case, name=name)
    )
Example #5
Source File: test_re_jy.py    From CTFCrackTools with GNU General Public License v3.0 6 votes vote down vote up
def test_unicode_whitespace(self):
        # Test for http://bugs.jython.org/issue2226
        # Every BMP code point (surrogates excluded) must match exactly one of
        # the \s / \S patterns, according to the expected whitespace set.
        whitespace_pattern = re.compile(r'\s', re.UNICODE)
        non_whitespace_pattern = re.compile(r'\S', re.UNICODE)
        separator_categories = {'Zl', 'Zp', 'Zs'}
        control_separators = set(chr(code) for code in (28, 29, 30, 31))
        extra_whitespace = {
            unicodedata.lookup('MONGOLIAN VOWEL SEPARATOR'),
            u'\u0085',  # NEXT LINE (NEL)
        }
        cpython_whitespace = set(' \t\n\r\f\v') | control_separators | extra_whitespace
        # could test to sys.maxunicode, but does not appear to be necessary
        for codepoint in xrange(0xFFFF):
            if 0xD800 <= codepoint <= 0xDFFF:
                # skip the surrogate range
                continue
            char = unichr(codepoint)
            expected_whitespace = (char in cpython_whitespace
                                   or category(char) in separator_categories)
            if expected_whitespace:
                self.assertRegexpMatches(char, whitespace_pattern)
                self.assertNotRegexpMatches(char, non_whitespace_pattern)
            else:
                self.assertNotRegexpMatches(char, whitespace_pattern)
                self.assertRegexpMatches(char, non_whitespace_pattern)
Example #6
Source File: latex2text.py    From Menotexport with GNU General Public License v3.0 6 votes vote down vote up
def _greekletters(letterlist):
    """Add macros for each Greek letter name to ``_default_macro_list``.

    For every name two ``(macro name, character)`` pairs are appended: the
    name as given, mapped to the small letter, and the name with its first
    character upper-cased, mapped to the capital letter.
    """
    for macro in letterlist:
        unicode_name = macro.upper()
        if unicode_name == 'LAMBDA':
            # Unicode character names use the spelling LAMDA.
            unicode_name = 'LAMDA'

        # Two small letters map to their symbol variants instead.
        if unicode_name == 'PHI':
            small_name = "GREEK PHI SYMBOL"
        elif unicode_name == 'EPSILON':
            small_name = "GREEK LUNATE EPSILON SYMBOL"
        else:
            small_name = "GREEK SMALL LETTER " + unicode_name
        capital_name = "GREEK CAPITAL LETTER " + unicode_name

        _default_macro_list.append((macro, unicodedata.lookup(small_name)))
        capitalized_macro = macro[0].upper() + macro[1:]
        _default_macro_list.append(
            (capitalized_macro, unicodedata.lookup(capital_name))
        )
Example #7
Source File: test_ucn.py    From Project-New-Reign---Nemesis-Main with GNU General Public License v3.0 6 votes vote down vote up
def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        alias_table = (
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
        )
        for alias_name, code in alias_table:
            self.checkletter(alias_name, chr(code))
            # The alias must differ from the primary name yet resolve to the
            # same character.
            primary_name = unicodedata.name(chr(code))
            self.assertNotEqual(primary_name, alias_name)
            via_alias = unicodedata.lookup(alias_name)
            via_primary = unicodedata.lookup(primary_name)
            self.assertEqual(via_alias, via_primary)
            # The 3.2.0 database is expected not to know the alias.
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias_name)
Example #8
Source File: test_ucn.py    From Fluid-Designer with GNU General Public License v3.0 6 votes vote down vote up
def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        known_aliases = (
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
        )
        for alias_name, code in known_aliases:
            self.checkletter(alias_name, chr(code))
            # Alias and primary name are distinct strings for one character.
            official_name = unicodedata.name(chr(code))
            self.assertNotEqual(official_name, alias_name)
            from_alias = unicodedata.lookup(alias_name)
            from_official = unicodedata.lookup(official_name)
            self.assertEqual(from_alias, from_official)
            # Looking the alias up in the 3.2.0 database must fail.
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias_name)
Example #9
Source File: test_ucn.py    From ironpython3 with Apache License 2.0 6 votes vote down vote up
def test_named_sequences_full(self):
        # Check all the named sequences
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        # The test is skipped when the data file cannot be fetched.
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for raw_line in testdata:
            entry = raw_line.strip()
            # Ignore blank lines and comment lines.
            if not entry or entry.startswith('#'):
                continue
            # Each entry is "<sequence name>;<space separated hex code points>".
            seqname, hex_codes = entry.split(';')
            expected = ''.join(chr(int(code, 16)) for code in hex_codes.split())
            self.assertEqual(unicodedata.lookup(seqname), expected)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)
Example #10
Source File: test_ucn.py    From Project-New-Reign---Nemesis-Main with GNU General Public License v3.0 6 votes vote down vote up
def test_named_sequences_full(self):
        # Check all the named sequences
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        # Skip rather than fail when the data file is not retrievable.
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for raw_line in testdata:
            record = raw_line.strip()
            # Blank lines and '#' comments carry no data.
            if not record or record.startswith('#'):
                continue
            # A record holds the sequence name, ';', then hex code points.
            seqname, hex_field = record.split(';')
            sequence = ''.join(chr(int(digits, 16)) for digits in hex_field.split())
            self.assertEqual(unicodedata.lookup(seqname), sequence)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)
Example #11
Source File: test_re_jy.py    From CTFCrackTools-V2 with GNU General Public License v3.0 6 votes vote down vote up
def test_unicode_whitespace(self):
        # Test for http://bugs.jython.org/issue2226
        # Each BMP code point outside the surrogate range is checked against
        # both the \s and the \S pattern.
        space_re = re.compile(r'\s', re.UNICODE)
        non_space_re = re.compile(r'\S', re.UNICODE)
        separator_categories = {'Zl', 'Zp', 'Zs'}
        control_separators = set(chr(code) for code in (28, 29, 30, 31))
        special_cases = {
            unicodedata.lookup('MONGOLIAN VOWEL SEPARATOR'),
            u'\u0085',  # NEXT LINE (NEL)
        }
        cpython_whitespace = set(' \t\n\r\f\v') | control_separators | special_cases
        # could test to sys.maxunicode, but does not appear to be necessary
        for codepoint in xrange(0xFFFF):
            if 0xD800 <= codepoint <= 0xDFFF:
                # surrogates are not tested
                continue
            char = unichr(codepoint)
            should_be_space = (char in cpython_whitespace
                               or category(char) in separator_categories)
            if should_be_space:
                self.assertRegexpMatches(char, space_re)
                self.assertNotRegexpMatches(char, non_space_re)
            else:
                self.assertNotRegexpMatches(char, space_re)
                self.assertRegexpMatches(char, non_space_re)
Example #12
Source File: _defaultspecs.py    From pylatexenc with MIT License 6 votes vote down vote up
def _greekletters(letterlist):
    """Append macro specs for each Greek letter name to the base specs.

    For every name two ``MacroTextSpec`` entries are appended to
    ``_latex_specs_base['macros']``: the name as given, mapped to the small
    letter, and the name with its first character upper-cased, mapped to the
    capital letter.
    """
    for macro in letterlist:
        unicode_name = macro.upper()
        if unicode_name == 'LAMBDA':
            # Unicode character names use the spelling LAMDA.
            unicode_name = 'LAMDA'

        # Two small letters map to their symbol variants instead.
        if unicode_name == 'PHI':
            small_name = "GREEK PHI SYMBOL"
        elif unicode_name == 'EPSILON':
            small_name = "GREEK LUNATE EPSILON SYMBOL"
        else:
            small_name = "GREEK SMALL LETTER " + unicode_name
        capital_name = "GREEK CAPITAL LETTER " + unicode_name

        small_spec = MacroTextSpec(macro, unicodedata.lookup(small_name))
        _latex_specs_base['macros'].append(small_spec)
        capital_spec = MacroTextSpec(macro[0].upper() + macro[1:],
                                     unicodedata.lookup(capital_name))
        _latex_specs_base['macros'].append(capital_spec)
Example #13
Source File: test_ucn.py    From ironpython3 with Apache License 2.0 6 votes vote down vote up
def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        alias_cases = (
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
        )
        for alias_name, code in alias_cases:
            self.checkletter(alias_name, chr(code))
            # The character's own name differs from the alias, but both names
            # must look up the same character.
            canonical_name = unicodedata.name(chr(code))
            self.assertNotEqual(canonical_name, alias_name)
            char_from_alias = unicodedata.lookup(alias_name)
            char_from_name = unicodedata.lookup(canonical_name)
            self.assertEqual(char_from_alias, char_from_name)
            # The alias must be unknown to the 3.2.0 database.
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias_name)
Example #14
Source File: test_ucn.py    From CTFCrackTools with GNU General Public License v3.0 5 votes vote down vote up
def test_bmp_characters(self):
        # Round-trip check: every named character in the Basic Multilingual
        # Plane must be found again when its name is looked up.
        import unicodedata
        for code in xrange(0x10000):
            char = unichr(code)
            # Code points without a name yield None and are skipped.
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)
Example #15
Source File: test_reshape.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_unicode(self, sparse):
        # See GH 6885 - get_dummies chokes on unicode values
        import unicodedata
        plain_e = 'e'
        e_acute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
        values = [plain_e, e_acute, e_acute]
        result = get_dummies(values, prefix='letter', sparse=sparse)
        # One indicator column per distinct value, named "letter_<value>".
        accented_column = u('letter_%s') % e_acute
        expected = DataFrame(
            {'letter_e': [1, 0, 0], accented_column: [0, 1, 1]},
            dtype=np.uint8,
        )
        assert_frame_equal(result, expected)
Example #16
Source File: test_ucn.py    From CTFCrackTools with GNU General Public License v3.0 5 votes vote down vote up
def test_errors(self):
        # Invalid calls to name() and lookup() must raise the listed errors.
        import unicodedata
        expectations = (
            (TypeError, unicodedata.name, ()),           # missing argument
            (TypeError, unicodedata.name, (u'xx',)),     # more than one character
            (TypeError, unicodedata.lookup, ()),         # missing argument
            (KeyError, unicodedata.lookup, (u'unknown',)),
        )
        for expected_error, function, arguments in expectations:
            self.assertRaises(expected_error, function, *arguments)
Example #17
Source File: _base.py    From MIA-Dictionary-Addon with GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, vk=None, char=None, is_dead=False):
        """Store the key description and, for dead keys, the combining character.

        vk -- stored unchanged as ``self.vk``.
        char -- converted with ``six.text_type`` unless it is None.
        is_dead -- when true, ``self.combining`` is set to the character whose
            Unicode name is ``'COMBINING '`` followed by the name of ``char``;
            otherwise ``self.combining`` is None.
        """
        self.vk = vk
        self.char = None if char is None else six.text_type(char)
        self.is_dead = is_dead

        if not self.is_dead:
            self.combining = None
            return

        combining_name = 'COMBINING ' + unicodedata.name(self.char)
        self.combining = unicodedata.lookup(combining_name)
        # NOTE(review): unicodedata.lookup raises KeyError for an unknown name
        # rather than returning an empty value, so this check looks
        # unreachable - confirm before relying on KeyError(char).
        if not self.combining:
            raise KeyError(char)
Example #18
Source File: test_ucn.py    From CTFCrackTools with GNU General Public License v3.0 5 votes vote down vote up
def test_ascii_letters(self):
        # Every lowercase ASCII letter must round-trip through lookup() and
        # name() under its "LATIN SMALL LETTER <X>" name.
        import unicodedata

        # The stop value is exclusive, hence the +1 so that "z" is covered.
        for code in xrange(ord("a"), ord("z") + 1):
            char = chr(code)
            name = "LATIN SMALL LETTER %s" % char.upper()
            looked_up = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(looked_up), name)
Example #19
Source File: test_reshape.py    From coffeegrindsize with MIT License 5 votes vote down vote up
def test_unicode(self, sparse):
        # See GH 6885 - get_dummies chokes on unicode values
        import unicodedata
        plain_e = 'e'
        e_acute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
        values = [plain_e, e_acute, e_acute]
        result = get_dummies(values, prefix='letter', sparse=sparse)
        # One indicator column per distinct value, named "letter_<value>".
        accented_column = u('letter_%s') % e_acute
        expected = DataFrame(
            {'letter_e': [1, 0, 0], accented_column: [0, 1, 1]},
            dtype=np.uint8,
        )
        if sparse:
            # The sparse variant is compared against sparse columns.
            expected = expected.apply(pd.SparseArray, fill_value=0)
        assert_frame_equal(result, expected)
Example #20
Source File: glyphIgo.py    From glyphIgo with MIT License 5 votes vote down vote up
def __lookup_character(self):
        """Return the list of characters selected by the ``character`` argument.

        When ``"heuristic"`` is present in the parsed arguments, every
        character whose Unicode name contains all words of the query is
        returned.  Otherwise the query is taken, in this order, as a single
        character, a hexadecimal code point (two pattern variants), a decimal
        code point, or an exact Unicode character name.
        """
        query = self.__args.character

        if "heuristic" not in self.__args:
            # try char, codepoint or exact name lookup
            if len(query) == 1:
                # Unicode char
                return [ u"" + query ]
            if re.match(self.PATTERN_HEX_0x, query) is not None:
                # hex, two-character prefix
                return [ unichr(int(query[2:], 16)) ]
            if re.match(self.PATTERN_HEX_x, query) is not None:
                # hex, one-character prefix
                return [ unichr(int(query[1:], 16)) ]
            if re.match(self.PATTERN_DEC, query) is not None:
                # decimal
                return [ unichr(int(query)) ]
            # exact name
            return [ unicodedata.lookup(query) ]

        # try fuzzy match: keep the non-empty words of the upper-cased query
        wanted_words = [word for word in query.upper().split(" ") if len(word) > 0]
        matches = []
        # Unicode codepoints range from 0 to 0x10FFFF = 1114111
        for codepoint in range(1114112):
            char = unichr(codepoint)
            name_words = unicodedata.name(char, "UNKNOWN").split(" ")
            if all(word in name_words for word in wanted_words):
                matches.append(char)
        return matches
Example #21
Source File: util.py    From odoo13-x64 with GNU General Public License v3.0 5 votes vote down vote up
def _mk_char_map(mapping):
    """Transform a dictionary with comma separated uniode chracter names
    to tuples with unicode characters as key."""
    for key, value in mapping.items():
        for char in key.split(','):
            try:
                yield (unicodedata.lookup(char), value)
            except KeyError:  # pragma: no cover (does not happen on Python3)
                pass


# build mapping of Unicode characters to equivalent ASCII characters 
Example #22
Source File: test_reshape.py    From elasticintel with GNU General Public License v3.0 5 votes vote down vote up
def test_unicode(self):
        # See GH 6885 - get_dummies chokes on unicode values
        import unicodedata
        plain_e = 'e'
        e_acute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
        values = [plain_e, e_acute, e_acute]
        result = get_dummies(values, prefix='letter', sparse=self.sparse)
        # Expected columns are given as {row index: indicator} mappings.
        plain_column = dict(enumerate([1, 0, 0]))
        accented_column = dict(enumerate([0, 1, 1]))
        expected = DataFrame(
            {'letter_e': plain_column,
             u('letter_%s') % e_acute: accented_column},
            dtype=np.uint8,
        )
        assert_frame_equal(result, expected)
Example #23
Source File: util.py    From odoo12-x64 with GNU General Public License v3.0 5 votes vote down vote up
def _mk_char_map(mapping):
    """Transform a dictionary with comma separated uniode chracter names
    to tuples with unicode characters as key."""
    for key, value in mapping.items():
        for char in key.split(','):
            try:
                yield (unicodedata.lookup(char), value)
            except KeyError:  # pragma: no cover (does not happen on Python3)
                pass


# build mapping of Unicode characters to equivalent ASCII characters 
Example #24
Source File: test_ucn.py    From CTFCrackTools-V2 with GNU General Public License v3.0 5 votes vote down vote up
def test_errors(self):
        # name() and lookup() must reject bad calls with the listed errors.
        import unicodedata
        cases = (
            (TypeError, unicodedata.name, ()),           # no argument
            (TypeError, unicodedata.name, (u'xx',)),     # two characters
            (TypeError, unicodedata.lookup, ()),         # no argument
            (KeyError, unicodedata.lookup, (u'unknown',)),
        )
        for error_type, func, args in cases:
            self.assertRaises(error_type, func, *args)
Example #25
Source File: test_ucn.py    From medicare-demo with Apache License 2.0 5 votes vote down vote up
def test_ascii_letters(self):
        # Each lowercase ASCII letter is looked up by its
        # "LATIN SMALL LETTER <X>" name and must report that same name back.
        import unicodedata

        # xrange stops before its end value; +1 keeps "z" in the test.
        for code in xrange(ord("a"), ord("z") + 1):
            char = chr(code)
            name = "LATIN SMALL LETTER %s" % char.upper()
            looked_up = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(looked_up), name)
Example #26
Source File: test_util.py    From magic-wormhole-mailbox-server with MIT License 5 votes vote down vote up
def test_to_bytes(self):
        # util.to_bytes must return a byte string for ASCII and non-ASCII text.
        bytes_type = type(b"")

        ascii_result = util.to_bytes("abc")
        self.assertIsInstance(ascii_result, bytes_type)
        self.assertEqual(ascii_result, b"abc")

        a_diaeresis = unicodedata.lookup("LATIN SMALL LETTER A WITH DIAERESIS")
        encoded = util.to_bytes(a_diaeresis + "bc")
        self.assertIsInstance(encoded, bytes_type)
        # The first character is expected as the two bytes C3 A4.
        self.assertEqual(encoded, b"\xc3\xa4\x62\x63")
Example #27
Source File: test_ucn.py    From Project-New-Reign---Nemesis-Main with GNU General Public License v3.0 5 votes vote down vote up
def test_named_sequences_sample(self):
        # Check a few named sequences.  See #12753.
        samples = (
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        )
        for sequence_name, expected in samples:
            # lookup() resolves the sequence name to the whole string...
            self.assertEqual(unicodedata.lookup(sequence_name), expected)
            # ...while checkletter is expected to fail with SyntaxError and
            # the 3.2.0 database with KeyError.
            with self.assertRaises(SyntaxError):
                self.checkletter(sequence_name, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(sequence_name)
Example #28
Source File: test_ucn.py    From Project-New-Reign---Nemesis-Main with GNU General Public License v3.0 5 votes vote down vote up
def test_bmp_characters(self):
        # Round-trip check over the Basic Multilingual Plane: looking up a
        # character's name must give back the character itself.
        for codepoint in range(0x10000):
            char = chr(codepoint)
            name = unicodedata.name(char, None)
            if name is None:
                # unnamed code point, nothing to look up
                continue
            self.assertEqual(unicodedata.lookup(name), char)
Example #29
Source File: test_ucn.py    From Project-New-Reign---Nemesis-Main with GNU General Public License v3.0 5 votes vote down vote up
def test_ascii_letters(self):
        # Each lowercase ASCII letter is looked up by its
        # "LATIN SMALL LETTER <X>" name and must report that same name back.
        # range() excludes its stop value; +1 makes sure "z" is tested too.
        for code in range(ord("a"), ord("z") + 1):
            char = chr(code)
            name = "LATIN SMALL LETTER %s" % char.upper()
            looked_up = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(looked_up), name)
Example #30
Source File: test_ucn.py    From gcblue with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_errors(self):
        # Bad calls to name() and lookup() must raise the listed exceptions.
        import unicodedata
        checks = (
            (TypeError, unicodedata.name, ()),           # argument missing
            (TypeError, unicodedata.name, (u'xx',)),     # not a single character
            (TypeError, unicodedata.lookup, ()),         # argument missing
            (KeyError, unicodedata.lookup, (u'unknown',)),
        )
        for exception_type, callee, call_args in checks:
            self.assertRaises(exception_type, callee, *call_args)