Python six.unichr() Examples

The following are 30 code examples of six.unichr(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module six, or try the search function.
Example #1
Source File: letter.py    From hangul-toolkit with Apache License 2.0 6 votes vote down vote up
def compose(chosung, joongsung, jongsung=u''):
    """Build one Hangul letter from the given chosung, joongsung and jongsung.

    @param chosung the letter looked up in CHO
    @param joongsung the letter looked up in JOONG
    @param jongsung the terminal letter looked up in JONG. Optional; ``None``
           is treated the same as the default ``u''``.
    @return the character at code point 0xAC00 plus the combined index
    @raise NotHangulException if any of the three lookups fails
    """
    terminal = u'' if jongsung is None else jongsung

    try:
        lead = CHO.index(chosung)
        vowel = JOONG.index(joongsung)
        tail = JONG.index(terminal)
    except Exception:
        raise NotHangulException('No valid Hangul character index')

    # Mixed-radix position of the syllable: (lead, vowel, tail).
    syllable_offset = (lead * NUM_JOONG + vowel) * NUM_JONG + tail
    return unichr(0xAC00 + syllable_offset)
Example #2
Source File: text_encoder.py    From BERT with Apache License 2.0 6 votes vote down vote up
def _unescape_token(escaped_token):
  """Reverse the escaping applied by _escape_token().

  A single trailing "_" is dropped, then every match of _UNESCAPE_REGEX is
  replaced by the character it stands for.

  Args:
    escaped_token: a unicode string

  Returns:
    token: a unicode string
  """

  def _substitute(found):
    code = found.group(1)
    if code is None:
      # No captured code point: the match is one of the two literal escapes.
      if found.group(0) == u"\\u":
        return u"_"
      return u"\\"

    try:
      return six.unichr(int(code))
    except (ValueError, OverflowError):
      return u"\u3013"  # Unicode for undefined character.

  if escaped_token.endswith("_"):
    body = escaped_token[:-1]
  else:
    body = escaped_token
  return _UNESCAPE_REGEX.sub(_substitute, body)
Example #3
Source File: text_encoder.py    From tensor2tensor with Apache License 2.0 6 votes vote down vote up
def _unescape_token(escaped_token):
  """Undo _escape_token(): turn an escaped token back into the raw token.

  Args:
    escaped_token: a unicode string

  Returns:
    token: a unicode string
  """

  def _decode(match_obj):
    digits = match_obj.group(1)
    if digits is not None:
      # A numeric escape: convert the captured digits to a character.
      try:
        return six.unichr(int(digits))
      except (ValueError, OverflowError):
        return u"\u3013"  # Unicode for undefined character.
    # Otherwise the match is a literal escape for "_" or for a backslash.
    return u"_" if match_obj.group(0) == u"\\u" else u"\\"

  unterminated = escaped_token
  if unterminated.endswith("_"):
    unterminated = unterminated[:-1]
  return _UNESCAPE_REGEX.sub(_decode, unterminated)
Example #4
Source File: stringblock.py    From Airtest with Apache License 2.0 6 votes vote down vote up
def getRaw(self, idx):
        """Return the string stored at position ``idx`` of the string table.

        Returns None when ``idx`` is negative, past the end of
        ``self.m_stringOffsets``, or when that list is empty. Otherwise the
        string length is read with ``self.getShort`` at the string's offset and
        that many characters are read, one every two offset units after it.
        """
        offsets = self.m_stringOffsets
        if idx < 0 or offsets == [] or idx >= len(offsets):
            return None

        start = offsets[idx].get_value()
        count = self.getShort(self.m_strings, start)

        chars = []
        for step in range(1, count + 1):
            # get the unicode character as the apk might contain non-ASCII label
            char = six.unichr(self.getShort(self.m_strings, start + 2 * step))

            # FIXME (carried over from the original): "&" characters are
            # dropped from the result.
            if char == "&":
                continue
            chars.append(char)

        return "".join(chars)
Example #5
Source File: markup.py    From maple-blog with GNU General Public License v3.0 6 votes vote down vote up
def handle_charref(self, name):
        """
        Called when a char ref like '&#8212;' or '&#x2014;' is found

        `name` is the char ref without the leading '&#' and the trailing
        semicolon (e.g. `8212` or `x2014`); the '#' is re-added here before
        the reference is passed on to `_handle_ref`.

        A leading 'x' or 'X' marks a hexadecimal code point, anything else is
        parsed as decimal. If the reference cannot be converted to a
        character, an empty string is passed on instead.
        """
        try:
            # Accept both '&#x2014;' and '&#X2014;': the hex marker is
            # case-insensitive in HTML.
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = six.unichr(codepoint)
        except (ValueError, OverflowError):
            # Not a number, or not a valid code point.
            char = ''
        self._handle_ref('#' + name, char)
Example #6
Source File: text_encoder.py    From training_results_v0.5 with Apache License 2.0 6 votes vote down vote up
def _unescape_token(escaped_token):
  """Convert a token produced by _escape_token() back to its original form.

  Args:
    escaped_token: a unicode string

  Returns:
    token: a unicode string
  """
  # Literal escapes: u"\\u" stands for "_"; any other code-less match is
  # treated as a backslash.
  literal_escapes = {u"\\u": u"_"}

  def _expand(hit):
    number = hit.group(1)
    if number is None:
      return literal_escapes.get(hit.group(0), u"\\")

    try:
      code_point = int(number)
      return six.unichr(code_point)
    except (ValueError, OverflowError):
      return u"\u3013"  # Unicode for undefined character.

  has_terminator = escaped_token.endswith("_")
  payload = escaped_token[:-1] if has_terminator else escaped_token
  return _UNESCAPE_REGEX.sub(_expand, payload)
Example #7
Source File: text_encoder.py    From at16k with MIT License 6 votes vote down vote up
def _unescape_token(escaped_token):
    """Invert _escape_token().

    Args:
        escaped_token: a unicode string

    Returns:
        token: a unicode string
    """

    def _restore(m):
        captured = m.group(1)
        if captured is None:
            # Literal escape: u"\\u" means "_", anything else a backslash.
            is_underscore = m.group(0) == u"\\u"
            return u"_" if is_underscore else u"\\"

        try:
            return six.unichr(int(captured))
        except (ValueError, OverflowError):
            return u"\u3013"    # Unicode for undefined character.

    # Strip the single trailing "_" if there is one.
    if escaped_token.endswith("_"):
        stripped = escaped_token[:-1]
    else:
        stripped = escaped_token
    return _UNESCAPE_REGEX.sub(_restore, stripped)
Example #8
Source File: hangul.py    From tossi with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def join_phonemes(*args):
    """Joins a Hangul letter from Korean phonemes.

    Accepts either the phonemes as separate arguments or one sequence of
    them; a missing coda defaults to ``CODAS[0]``. Raises ``TypeError`` when
    the phonemes cannot be unpacked into exactly onset, nucleus and coda.
    """
    phonemes = args
    if len(phonemes) == 1:
        # A single argument is taken as a sequence (onset, nucleus[, coda]).
        phonemes = phonemes[0]
    if len(phonemes) == 2:
        phonemes += (CODAS[0],)
    try:
        onset, nucleus, coda = phonemes
    except ValueError:
        raise TypeError('join_phonemes() takes at most 3 arguments')
    onset_index = ONSETS.index(onset)
    nucleus_index = NUCLEUSES.index(nucleus)
    syllable_index = onset_index * NUM_NUCLEUSES + nucleus_index
    offset = syllable_index * NUM_CODAS + CODAS.index(coda)
    return unichr(FIRST_HANGUL_OFFSET + offset)
Example #9
Source File: elodie_test.py    From elodie with Apache License 2.0 6 votes vote down vote up
def test_import_file_path_unicode_latin_nbsp():
    """Import a file whose name contains a non-breaking space (code point 160).

    The source file is copied into a fresh working folder under a unicode
    name, imported with ``elodie.import_file`` and the destination path is
    checked. The databases are restored and both working folders are removed
    even if the copy or the import raises.
    """
    temporary_folder, folder = helper.create_working_folder()
    temporary_folder_destination, folder_destination = helper.create_working_folder()

    try:
        origin = text_type(folder)+u'/unicode'+six_unichr(160)+u'filename.txt'

        shutil.copyfile(helper.get_file('valid.txt'), origin)

        helper.reset_dbs()
        try:
            dest_path = elodie.import_file(origin, folder_destination, False, False, False)
        finally:
            # Undo reset_dbs() even when the import fails, so a failure here
            # does not leak state into other tests.
            helper.restore_dbs()
    finally:
        shutil.rmtree(folder)
        shutil.rmtree(folder_destination)

    assert helper.path_tz_fix(os.path.join('2016-04-Apr','London',u'2016-04-07_11-15-26-unicode\xa0filename-sample-title.txt')) in dest_path, dest_path
Example #10
Source File: reader.py    From ion-python with Apache License 2.0 6 votes vote down vote up
def _narrow_unichr(code_point):
    """Retrieves the unicode character representing any given code point, in a way that won't break on narrow builds.

    This is necessary because the built-in unichr function will fail for ordinals above 0xFFFF on narrow builds (UCS2);
    ordinals above 0xFFFF would require recalculating and combining surrogate pairs. This avoids that by retrieving the
    unicode character that was initially read.

    Args:
        code_point (int|CodePoint): An int or a subclass of int that contains the unicode character representing its
            code point in an attribute named 'char'.
    """
    try:
        if len(code_point.char) > 1:
            return code_point.char
    except AttributeError:
        pass
    return six.unichr(code_point) 
Example #11
Source File: test_utils.py    From masakari with Apache License 2.0 6 votes vote down vote up
def test_exception_converted(self):
        """validate_integer must raise InvalidInput for each invalid call below."""
        # Each entry: (positional arguments, keyword arguments).
        invalid_calls = (
            (("im-not-an-int", "not-an-int"), {}),
            ((3.14, "Pie"), {}),
            (("299", "Sparta no-show"), {'min_value': 300, 'max_value': 300}),
            ((55, "doing 55 in a 54"), {'max_value': 54}),
            ((six.unichr(129), "UnicodeError"), {'max_value': 1000}),
        )
        for positional, keywords in invalid_calls:
            self.assertRaises(exception.InvalidInput,
                              utils.validate_integer,
                              *positional, **keywords)
Example #12
Source File: text_encoder.py    From Sequence-Semantic-Embedding with Apache License 2.0 6 votes vote down vote up
def _unescape_token(escaped_token):
  """Reverse _escape_token().

  Numeric escapes that cannot be converted to a character are replaced by
  an empty string.

  Args:
    escaped_token: a unicode string

  Returns:
    token: a unicode string
  """

  def _replacement(found):
    code = found.group(1)
    if code is None:
      # Literal escape: u"\\u" means "_", anything else a backslash.
      if found.group(0) == u"\\u":
        return u"_"
      return u"\\"

    try:
      return six.unichr(int(code))
    except (ValueError, OverflowError):
      return ""

  core = escaped_token
  if core.endswith("_"):
    core = core[:-1]
  return _UNESCAPE_REGEX.sub(_replacement, core)
Example #13
Source File: test_streamexpect.py    From python-streamexpect with Mozilla Public License 2.0 6 votes vote down vote up
def test_unicode_combining_characters(self):
        """A searcher built from either form of a character must find both forms."""
        # The same character can have several unicode representations: an
        # accented letter may be one precomposed code point, or the plain
        # letter followed by a combining code point. See
        # https://docs.python.org/2/library/unicodedata.html.
        # Here: capital C with cedilla, first precomposed, then as "C" plus
        # the combining cedilla.
        composite = six.unichr(0xC7)
        base_letter = six.unichr(0x43)
        cedilla = six.unichr(0x0327)
        combining = base_letter + cedilla

        # Test combinations of search and character
        forms = (composite, combining)
        for pattern in forms:
            searcher = TextSearcher(pattern)
            for candidate in forms:
                self.assertIsNotNone(searcher.search(candidate))
Example #14
Source File: elodie_test.py    From elodie with Apache License 2.0 6 votes vote down vote up
def test_import_file_path_utf8_encoded_ascii_latin_nbsp():
    """Import a file given as a UTF-8 encoded path containing a non-breaking space.

    Same scenario as the unicode-path variant, but the origin path is encoded
    to UTF-8 bytes before it is used. The databases are restored and both
    working folders are removed even if the copy or the import raises.
    """
    temporary_folder, folder = helper.create_working_folder()
    temporary_folder_destination, folder_destination = helper.create_working_folder()

    try:
        origin = text_type(folder)+u'/unicode'+six_unichr(160)+u'filename.txt'
        # encode the unicode string to UTF-8 bytes
        origin = origin.encode('utf-8')

        shutil.copyfile(helper.get_file('valid.txt'), origin)

        helper.reset_dbs()
        try:
            dest_path = elodie.import_file(origin, folder_destination, False, False, False)
        finally:
            # Undo reset_dbs() even when the import fails, so a failure here
            # does not leak state into other tests.
            helper.restore_dbs()
    finally:
        shutil.rmtree(folder)
        shutil.rmtree(folder_destination)

    assert helper.path_tz_fix(os.path.join('2016-04-Apr','London',u'2016-04-07_11-15-26-unicode\xa0filename-sample-title.txt')) in dest_path, dest_path
Example #15
Source File: _compat.py    From cheroot with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def ntou(n, encoding='ISO-8859-1'):
        """Return the native string as Unicode with the given encoding.

        With the special encoding 'escape', ``n`` is decoded as ISO-8859-1 and
        every embedded \\uXXXX sequence is replaced by the character with that
        hexadecimal code point.
        """
        assert_native(n)
        # In Python 2, the native string type is bytes.
        if encoding != 'escape':
            # Assume it's already in the given encoding, which for ISO-8859-1
            # is almost always what was intended.
            return n.decode(encoding)

        # 'escape' is used by the test suite to pass a string with embedded
        # \uXXXX escapes without needing a u'' prefix on Python 2 (and no
        # prefix on Python 3).
        def _expand(escape):
            return six.unichr(int(escape.group(1), 16))

        return re.sub(r'\\u([0-9a-zA-Z]{4})', _expand, n.decode('ISO-8859-1'))
Example #16
Source File: text_encoder.py    From fine-lm with MIT License 6 votes vote down vote up
def _unescape_token(escaped_token):
  """Recover the original token from the output of _escape_token().

  Args:
    escaped_token: a unicode string

  Returns:
    token: a unicode string
  """

  def _unescape_one(match_obj):
    ordinal = match_obj.group(1)
    if ordinal is None:
      # Literal escape: u"\\u" means "_", anything else a backslash.
      whole = match_obj.group(0)
      return u"_" if whole == u"\\u" else u"\\"

    try:
      char = six.unichr(int(ordinal))
    except (ValueError, OverflowError):
      char = u"\u3013"  # Unicode for undefined character.
    return char

  # Drop the single trailing "_" if present.
  without_suffix = escaped_token[:-1] if escaped_token.endswith("_") else escaped_token
  return _UNESCAPE_REGEX.sub(_unescape_one, without_suffix)
Example #17
Source File: test_apirequest.py    From ec2-api with Apache License 2.0 5 votes vote down vote up
def test_render_response_utf8(self):
        """Non-ASCII characters must be rendered as numeric character references."""
        request = apirequest.APIRequest("FakeAction", "FakeVersion", {})
        value = six.unichr(40960) + u'abcd' + six.unichr(1972)
        response = {'utf8': value}
        rendered = request._render_response(response, 'uuid')
        self.assertIn('<utf8>&#40960;abcd&#1972;</utf8>', rendered.decode())

    # Tests for individual data element format functions 
Example #18
Source File: parameter_types.py    From masakari with Apache License 2.0 5 votes vote down vote up
def _get_all_chars():
    for i in range(0xFFFF):
        yield six.unichr(i)


# build a regex that matches all printable characters. This allows
# spaces in the middle of the name. Also note that the regexp below
# deliberately allows the empty string. This is so only the constraint
# which enforces a minimum length for the name is triggered when an
# empty string is tested. Otherwise it is not deterministic which
# constraint fails and this causes issues for some unittests when
# PYTHONHASHSEED is set randomly. 
Example #19
Source File: test_tokenizer.py    From MARA_Framework with GNU Lesser General Public License v3.0 5 votes vote down vote up
def unescape(test):
    """Decode the \\uXXXX escapes of a tokenizer test in place and return it.

    ``test["input"]`` and, for every entry of ``test["output"]`` other than
    the string "ParseError", the second element are decoded. When the third
    element of an output token is a dict, its keys and values are decoded as
    well.
    """
    def decode(inp):
        """Decode \\uXXXX escapes

        This decodes \\uXXXX escapes, possibly into non-BMP characters when
        two surrogate character escapes are adjacent to each other.
        """
        # This cannot be implemented using the unicode_escape codec
        # because that requires its input be ISO-8859-1, and we need
        # arbitrary unicode as input.
        def repl(m):
            if m.group(2) is not None:
                high = int(m.group(1), 16)
                low = int(m.group(2), 16)
                if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
                    # A valid surrogate pair: combine into one code point.
                    cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
                    return unichr(cp)
                else:
                    return unichr(high) + unichr(low)
            else:
                return unichr(int(m.group(1), 16))
        try:
            return _surrogateRe.sub(repl, inp)
        except ValueError:
            # This occurs when unichr throws ValueError, which should
            # only be for a lone-surrogate.
            if utils.supports_lone_surrogates:
                raise
            return None

    test["input"] = decode(test["input"])
    for token in test["output"]:
        if token == "ParseError":
            continue
        token[1] = decode(token[1])
        # Only a dict can hold key/value pairs to decode. Iterating the dict
        # itself would yield keys only, and changing it while iterating is
        # an error, so decode a snapshot of the items first and then refill
        # the same dict object.
        if len(token) > 2 and isinstance(token[2], dict):
            decoded = [(decode(key), decode(value))
                       for key, value in list(token[2].items())]
            token[2].clear()
            token[2].update(decoded)
    return test
Example #20
Source File: page.py    From ttrv with MIT License 5 votes vote down vote up
def _prompt_period(self, order):
        """Show the time-period menu and return the order for the pressed key.

        Enter maps to ``order`` itself, the keys '1' to '6' to ``order``
        suffixed with a time period. Returns None for any other key.
        """
        templates = ('{0}-hour', '{0}-day', '{0}-week',
                     '{0}-month', '{0}-year', '{0}-all')
        choices = {'\n': order}
        for number, template in enumerate(templates, 1):
            choices[str(number)] = template.format(order)

        menu_lines = docs.TIME_ORDER_MENU.strip().splitlines()
        key_code = self.term.show_notification(menu_lines)
        pressed = six.unichr(key_code)
        return choices.get(pressed)
Example #21
Source File: test_manager.py    From manila with Apache License 2.0 5 votes vote down vote up
def test_put_rados_object(self):
        """_put_rados_object must store the UTF-8 encoded text and return None."""
        text = six.unichr(246)
        put_stub = mock.Mock(return_value=None)
        self.mock_object(self._ceph_vol_client, 'put_object', put_stub)

        manager = self._manager_with_rados_store
        result = manager._put_rados_object('fakeobj', text)

        expected_payload = text.encode('utf-8')
        self._ceph_vol_client.put_object.assert_called_once_with(
            'fakepool', 'fakeobj', expected_payload)
        self.assertIsNone(result)
Example #22
Source File: test_manager.py    From manila with Apache License 2.0 5 votes vote down vote up
def test_get_rados_object(self):
        """_get_rados_object must return the stored bytes decoded as UTF-8."""
        stored_bytes = six.unichr(246).encode('utf-8')
        get_stub = mock.Mock(return_value=stored_bytes)
        self.mock_object(self._ceph_vol_client, 'get_object', get_stub)

        manager = self._manager_with_rados_store
        result = manager._get_rados_object('fakeobj')

        self._ceph_vol_client.get_object.assert_called_once_with(
            'fakepool', 'fakeobj')
        expected_text = stored_bytes.decode('utf-8')
        self.assertEqual(expected_text, result)
Example #23
Source File: codec.py    From polyline with MIT License 5 votes vote down vote up
def _write(self, output, curr_value, prev_value, factor):
        """Write the encoded difference between two scaled values to ``output``.

        Both values are multiplied by ``factor`` and rounded with
        ``self._py2_round``. Their difference is shifted left by one bit,
        bit-inverted if negative, and emitted in 5-bit groups, lowest group
        first, each as the character with code ``group + 63``. Every group
        except the last has bit 0x20 set.
        """
        scaled_curr = self._py2_round(curr_value * factor)
        scaled_prev = self._py2_round(prev_value * factor)
        delta = (scaled_curr - scaled_prev) << 1
        if delta < 0:
            delta = ~delta

        while delta >= 0x20:
            # More groups follow: flag this one with the 0x20 bit.
            group = 0x20 | (delta & 0x1f)
            output.write(six.unichr(group + 63))
            delta >>= 5

        output.write(six.unichr(delta + 63))
Example #24
Source File: log_processing_test.py    From scalyr-agent-2 with Apache License 2.0 5 votes vote down vote up
def test_multiple_redactions_in_line_with_hash_with_unicode(self):
        """Both "password" occurrences in a unicode line are replaced by their hash."""
        redactor = LogLineRedacter("/var/fake_log")
        redactor.add_redaction_rule("(password)", "\\H1")

        # Non-ASCII prefix (code point 8230) so the line is not plain ASCII.
        prefix = unichr(8230)
        line = prefix + "auth=password foo=password"
        hashes = (md5_hexdigest("password"), md5_hexdigest("password"))
        redacted = prefix + "auth=%s foo=%s" % hashes

        self._run_case(redactor, line, redacted, True)
Example #25
Source File: log_processing_test.py    From scalyr-agent-2 with Apache License 2.0 5 votes vote down vote up
def test_unicode_redactions(self):
        """A rule capturing the whole line must wrap a unicode line in "bb"."""
        redacter = LogLineRedacter("/var/fake_log")
        # 2->TODO: 're.subn' was fixed in Python 3.7: empty matches of the
        # pattern are now replaced when adjacent to a previous non-empty
        # match. On Python 3.6 and below the result is "bb...bb", whereas the
        # correct result, produced by Python 3.7+, is "bb..bbbbbb".
        redacter.add_redaction_rule("(.+)", "bb\\1bb")

        # build the input: a single non-ASCII character (code point 8230)
        line = unichr(8230)
        wrapped = "bb" + line + "bb"

        # go go go
        self._run_case(redacter, line, wrapped, True)
Example #26
Source File: plantuml.py    From plantweb with Apache License 2.0 5 votes vote down vote up
def _encode6bit(b):
    if b < 10:
        return unichr(48 + b)
    b -= 10
    if b < 26:
        return unichr(65 + b)
    b -= 26
    if b < 26:
        return unichr(97 + b)
    b -= 26
    if b == 0:
        return '-'
    if b == 1:
        return '_' 
Example #27
Source File: formatting.py    From xmldiff with MIT License 5 votes vote down vote up
def get_placeholder(self, element, ttype, close_ph):
        """Return the placeholder character for (element, ttype, close_ph).

        A placeholder already registered for the serialized element, ``ttype``
        and ``close_ph`` is reused. Otherwise ``self.placeholder`` is
        incremented, the character with that code point becomes the new
        placeholder, and it is recorded in both lookup tables.
        """
        key = (etree.tounicode(element), ttype, close_ph)
        known = self.tag2placeholder.get(key)
        if known is not None:
            return known

        self.placeholder += 1
        fresh = six.unichr(self.placeholder)
        self.placeholder2tag[fresh] = PlaceholderEntry(element, ttype, close_ph)
        self.tag2placeholder[key] = fresh
        return fresh
Example #28
Source File: compute_bleu.py    From models with Apache License 2.0 5 votes vote down vote up
def property_chars(self, prefix):
    """Return all characters whose unicode category starts with ``prefix``.

    Code points 0 up to, but excluding, ``sys.maxunicode`` are examined, and
    the matching characters are joined in code point order.
    """
    matching = []
    for code_point in range(sys.maxunicode):
        char = six.unichr(code_point)
        if unicodedata.category(char).startswith(prefix):
            matching.append(char)
    return "".join(matching)
Example #29
Source File: bleu_hook.py    From tensor2tensor with Apache License 2.0 5 votes vote down vote up
def property_chars(self, prefix):
    """Join every character whose unicode category begins with ``prefix``.

    The scan covers code points from 0 up to, but excluding,
    ``sys.maxunicode``.
    """
    candidates = (six.unichr(code_point) for code_point in range(sys.maxunicode))
    return "".join(
        char for char in candidates
        if unicodedata.category(char).startswith(prefix))
Example #30
Source File: ml_perf_bleu_metric.py    From lingvo with Apache License 2.0 5 votes vote down vote up
def property_chars(self, prefix):
    """Collect the characters in the unicode categories starting with ``prefix``.

    Returns one string with the matching characters for code points 0 up to,
    but excluding, ``sys.maxunicode``, in ascending order.
    """
    def _in_category(char):
        return unicodedata.category(char).startswith(prefix)

    selected = []
    for ordinal in range(sys.maxunicode):
        char = six.unichr(ordinal)
        if _in_category(char):
            selected.append(char)
    return "".join(selected)