Python six.unichr() Examples
The following are 30 code examples of six.unichr().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module six, or try the search function.
Example #1
Source File: letter.py From hangul-toolkit with Apache License 2.0 | 6 votes |
def compose(chosung, joongsung, jongsung=u''):
    """Return a Hangul syllable composed from the given phonemes.

    @param chosung: the initial consonant (an element of ``CHO``).
    @param joongsung: the medial vowel (an element of ``JOONG``).
    @param jongsung: the optional final consonant (an element of ``JONG``).
        Pass u'' or None when the syllable has no jongsung.
    @return: the composed Hangul syllable as a single character.
    @raise NotHangulException: if any phoneme is not a valid jamo.
    """
    if jongsung is None:
        jongsung = u''
    try:
        chosung_index = CHO.index(chosung)
        joongsung_index = JOONG.index(joongsung)
        jongsung_index = JONG.index(jongsung)
    except ValueError:
        # list.index raises ValueError for an unknown jamo; catching only
        # that (instead of bare Exception) avoids masking real bugs.
        raise NotHangulException('No valid Hangul character index')
    # Hangul syllables occupy a contiguous block starting at U+AC00, laid
    # out in (chosung, joongsung, jongsung) lexicographic order.
    return unichr(0xAC00 +
                  chosung_index * NUM_JOONG * NUM_JONG +
                  joongsung_index * NUM_JONG +
                  jongsung_index)
Example #2
Source File: text_encoder.py From BERT with Apache License 2.0 | 6 votes |
def _unescape_token(escaped_token):
    """Inverse of _escape_token().

    Args:
      escaped_token: a unicode string produced by _escape_token()

    Returns:
      token: the unescaped unicode string
    """
    def _replace(m):
        digits = m.group(1)
        if digits is None:
            # Literal escapes: "\u" stands for "_", "\\" stands for "\".
            return u"_" if m.group(0) == u"\\u" else u"\\"
        try:
            return six.unichr(int(digits))
        except (ValueError, OverflowError):
            return u"\u3013"  # Unicode for undefined character.

    if escaped_token.endswith("_"):
        escaped_token = escaped_token[:-1]
    return _UNESCAPE_REGEX.sub(_replace, escaped_token)
Example #3
Source File: text_encoder.py From tensor2tensor with Apache License 2.0 | 6 votes |
def _unescape_token(escaped_token):
    """Inverse of _escape_token().

    Args:
      escaped_token: a unicode string produced by _escape_token()

    Returns:
      token: the unescaped unicode string
    """
    def _decode_match(m):
        if m.group(1) is None:
            # "\u" decodes to "_" and "\\" decodes to "\".
            return u"_" if m.group(0) == u"\\u" else u"\\"
        try:
            return six.unichr(int(m.group(1)))
        except (ValueError, OverflowError):
            # Out-of-range ordinal: substitute the "undefined" glyph.
            return u"\u3013"

    trimmed = escaped_token
    if trimmed.endswith("_"):
        trimmed = trimmed[:-1]
    return _UNESCAPE_REGEX.sub(_decode_match, trimmed)
Example #4
Source File: stringblock.py From Airtest with Apache License 2.0 | 6 votes |
def getRaw(self, idx):
    """Return the raw string stored at index *idx* of the string block.

    Returns None when *idx* is out of range or no offsets were parsed.
    """
    if idx < 0 or self.m_stringOffsets == [] or idx >= len(self.m_stringOffsets):
        return None
    # Byte offset of this entry inside m_strings; the 16-bit value at that
    # offset is the character count of the string that follows.
    offset = self.m_stringOffsets[ idx ].get_value()
    length = self.getShort(self.m_strings, offset)
    data = ""
    while length > 0:
        # Step 2 bytes per character — presumably UTF-16LE storage; TODO confirm.
        offset += 2
        # get the unicode character as the apk might contain non-ASCII label
        data += six.unichr(self.getShort(self.m_strings, offset))
        # FIXME
        # NOTE(review): dropping a trailing '&' on every iteration looks like
        # a workaround for entity artifacts in labels — verify against real
        # apk data before relying on it.
        if data[-1] == "&":
            data = data[:-1]
        length -= 1
    return data
Example #5
Source File: markup.py From maple-blog with GNU General Public License v3.0 | 6 votes |
def handle_charref(self, name):
    """Handle a numeric character reference found in the markup.

    ``name`` is the reference body without the ampersand and semicolon,
    in either decimal (``#8212``) or hexadecimal (``#x2014``) form.
    """
    try:
        if name.startswith('x'):
            # Hex form: strip the leading 'x' marker.
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        char = six.unichr(codepoint)
    except (ValueError, OverflowError):
        # Malformed or out-of-range references degrade to empty text.
        char = ''
    self._handle_ref('#' + name, char)
Example #6
Source File: text_encoder.py From training_results_v0.5 with Apache License 2.0 | 6 votes |
def _unescape_token(escaped_token):
    """Inverse of _escape_token().

    Args:
      escaped_token: a unicode string produced by _escape_token()

    Returns:
      token: the unescaped unicode string
    """
    def _sub(m):
        numeric = m.group(1)
        if numeric is None:
            # Non-numeric escapes: "\u" -> "_", "\\" -> "\".
            return u"_" if m.group(0) == u"\\u" else u"\\"
        try:
            return six.unichr(int(numeric))
        except (ValueError, OverflowError):
            return u"\u3013"  # Unicode for undefined character.

    without_marker = (escaped_token[:-1]
                      if escaped_token.endswith("_") else escaped_token)
    return _UNESCAPE_REGEX.sub(_sub, without_marker)
Example #7
Source File: text_encoder.py From at16k with MIT License | 6 votes |
def _unescape_token(escaped_token):
    """Inverse of _escape_token().

    Args:
      escaped_token: a unicode string produced by _escape_token()

    Returns:
      token: the unescaped unicode string
    """
    def _expand(match):
        body = match.group(1)
        if body is None:
            # The two literal escapes: "\u" means "_", "\\" means "\".
            return u"_" if match.group(0) == u"\\u" else u"\\"
        try:
            return six.unichr(int(body))
        except (ValueError, OverflowError):
            # Substitute the "undefined character" glyph for bad ordinals.
            return u"\u3013"

    if escaped_token.endswith("_"):
        escaped_token = escaped_token[:-1]
    return _UNESCAPE_REGEX.sub(_expand, escaped_token)
Example #8
Source File: hangul.py From tossi with BSD 3-Clause "New" or "Revised" License | 6 votes |
def join_phonemes(*args):
    """Joins a Hangul letter from Korean phonemes.

    Accepts either three positional phonemes ``(onset, nucleus[, coda])``
    or a single tuple containing them.
    """
    if len(args) == 1:
        # A single tuple of (onset, nucleus[, coda]) was passed.
        args = args[0]
    if len(args) == 2:
        # No coda given: default to the empty coda.
        args += (CODAS[0],)
    try:
        onset, nucleus, coda = args
    except ValueError:
        raise TypeError('join_phonemes() takes at most 3 arguments')
    # Syllables are laid out contiguously from FIRST_HANGUL_OFFSET in
    # (onset, nucleus, coda) order.
    onset_part = ONSETS.index(onset) * NUM_NUCLEUSES
    block_index = (onset_part + NUCLEUSES.index(nucleus)) * NUM_CODAS
    return unichr(FIRST_HANGUL_OFFSET + block_index + CODAS.index(coda))
Example #9
Source File: elodie_test.py From elodie with Apache License 2.0 | 6 votes |
def test_import_file_path_unicode_latin_nbsp():
    # Import a file whose name contains U+00A0 (non-breaking space) to check
    # that unicode paths survive the import pipeline end to end.
    temporary_folder, folder = helper.create_working_folder()
    temporary_folder_destination, folder_destination = helper.create_working_folder()
    origin = text_type(folder)+u'/unicode'+six_unichr(160)+u'filename.txt'
    shutil.copyfile(helper.get_file('valid.txt'), origin)
    helper.reset_dbs()
    dest_path = elodie.import_file(origin, folder_destination, False, False, False)
    helper.restore_dbs()
    # Clean up both working folders regardless of the assertion below.
    shutil.rmtree(folder)
    shutil.rmtree(folder_destination)
    # The NBSP must be preserved (as \xa0) in the organized destination path.
    assert helper.path_tz_fix(os.path.join('2016-04-Apr','London',u'2016-04-07_11-15-26-unicode\xa0filename-sample-title.txt')) in dest_path, dest_path
Example #10
Source File: reader.py From ion-python with Apache License 2.0 | 6 votes |
def _narrow_unichr(code_point):
    """Retrieves the unicode character representing any given code point, in
    a way that won't break on narrow builds.

    On narrow (UCS2) builds the built-in unichr fails for ordinals above
    0xFFFF, which would otherwise require recombining surrogate pairs.
    When the argument is a CodePoint that carries the originally-read
    multi-unit character in its ``char`` attribute, that stored text is
    returned directly instead of being recomputed.

    Args:
        code_point (int|CodePoint): An int, or an int subclass exposing the
            unicode character it represents via a ``char`` attribute.
    """
    stored = getattr(code_point, 'char', None)
    if stored is not None and len(stored) > 1:
        # Surrogate pair already materialized by the reader; reuse it.
        return stored
    return six.unichr(code_point)
Example #11
Source File: test_utils.py From masakari with Apache License 2.0 | 6 votes |
def test_exception_converted(self):
    # validate_integer must convert every malformed input into
    # exception.InvalidInput rather than leaking ValueError/TypeError.
    # Non-numeric string.
    self.assertRaises(exception.InvalidInput,
                      utils.validate_integer,
                      "im-not-an-int", "not-an-int")
    # Float is not an integer.
    self.assertRaises(exception.InvalidInput,
                      utils.validate_integer,
                      3.14, "Pie")
    # Below the minimum bound.
    self.assertRaises(exception.InvalidInput,
                      utils.validate_integer,
                      "299", "Sparta no-show",
                      min_value=300, max_value=300)
    # Above the maximum bound.
    self.assertRaises(exception.InvalidInput,
                      utils.validate_integer,
                      55, "doing 55 in a 54",
                      max_value=54)
    # Non-ASCII character input.
    self.assertRaises(exception.InvalidInput,
                      utils.validate_integer,
                      six.unichr(129), "UnicodeError",
                      max_value=1000)
Example #12
Source File: text_encoder.py From Sequence-Semantic-Embedding with Apache License 2.0 | 6 votes |
def _unescape_token(escaped_token):
    """Inverse of _escape_token().

    Args:
      escaped_token: a unicode string produced by _escape_token()

    Returns:
      token: the unescaped unicode string
    """
    def _restore(m):
        ordinal = m.group(1)
        if ordinal is None:
            # "\u" decodes to "_", "\\" decodes to "\".
            return u"_" if m.group(0) == u"\\u" else u"\\"
        try:
            return six.unichr(int(ordinal))
        except (ValueError, OverflowError):
            # This variant silently drops undecodable escapes.
            return ""

    if escaped_token.endswith("_"):
        escaped_token = escaped_token[:-1]
    return _UNESCAPE_REGEX.sub(_restore, escaped_token)
Example #13
Source File: test_streamexpect.py From python-streamexpect with Mozilla Public License 2.0 | 6 votes |
def test_unicode_combining_characters(self):
    # Some unicode characters can be represented in multiple ways - for
    # example, an accented character may be a single code point (with the
    # accent baked in), or it may be the "normal" letter with a combining
    # code point. See https://docs.python.org/2/library/unicodedata.html.
    # The points below are for a capital C with a cedilla, first as a
    # composite character, second as a pairing of C and the cedilla
    # combining character.
    composite = six.unichr(0xC7)  # U+00C7, precomposed form
    combining = six.unichr(0x43) + six.unichr(0x0327)  # 'C' + combining cedilla
    # Test combinations of search and character: each canonically-equivalent
    # form must be found whether the searcher was built from the composite
    # or the combining representation.
    for text in composite, combining:
        searcher = TextSearcher(text)
        self.assertIsNotNone(searcher.search(composite))
        self.assertIsNotNone(searcher.search(combining))
Example #14
Source File: elodie_test.py From elodie with Apache License 2.0 | 6 votes |
def test_import_file_path_utf8_encoded_ascii_latin_nbsp():
    # Same scenario as the unicode NBSP test, but the origin path is handed
    # to the importer as UTF-8 encoded bytes rather than a unicode string.
    temporary_folder, folder = helper.create_working_folder()
    temporary_folder_destination, folder_destination = helper.create_working_folder()
    origin = text_type(folder)+u'/unicode'+six_unichr(160)+u'filename.txt'
    # encode the unicode string to ascii
    origin = origin.encode('utf-8')
    shutil.copyfile(helper.get_file('valid.txt'), origin)
    helper.reset_dbs()
    dest_path = elodie.import_file(origin, folder_destination, False, False, False)
    helper.restore_dbs()
    # Clean up both working folders regardless of the assertion below.
    shutil.rmtree(folder)
    shutil.rmtree(folder_destination)
    # The NBSP must survive (as \xa0) in the organized destination path.
    assert helper.path_tz_fix(os.path.join('2016-04-Apr','London',u'2016-04-07_11-15-26-unicode\xa0filename-sample-title.txt')) in dest_path, dest_path
Example #15
Source File: _compat.py From cheroot with BSD 3-Clause "New" or "Revised" License | 6 votes |
def ntou(n, encoding='ISO-8859-1'):
    """Return the native string as Unicode with the given encoding."""
    assert_native(n)
    # In Python 2, the native string type is bytes.
    if encoding == 'escape':
        # The test suite uses the special encoding 'escape' to signal that
        # it wants to pass a string with embedded \uXXXX escapes, without
        # having to prefix it with u'' for Python 2 (and no prefix for
        # Python 3).
        latin1_text = n.decode('ISO-8859-1')
        return re.sub(
            r'\\u([0-9a-zA-Z]{4})',
            lambda m: six.unichr(int(m.group(1), 16)),
            latin1_text,
        )
    # Otherwise assume the string is already in the given encoding, which
    # for ISO-8859-1 is almost always what was intended.
    return n.decode(encoding)
Example #16
Source File: text_encoder.py From fine-lm with MIT License | 6 votes |
def _unescape_token(escaped_token):
    """Inverse of _escape_token().

    Args:
      escaped_token: a unicode string produced by _escape_token()

    Returns:
      token: the unescaped unicode string
    """
    def _unescape_match(mo):
        if mo.group(1) is None:
            # Literal escapes: "\u" -> "_", "\\" -> "\".
            return u"_" if mo.group(0) == u"\\u" else u"\\"
        try:
            return six.unichr(int(mo.group(1)))
        except (ValueError, OverflowError):
            return u"\u3013"  # Unicode for undefined character.

    body = escaped_token[:-1] if escaped_token.endswith("_") else escaped_token
    return _UNESCAPE_REGEX.sub(_unescape_match, body)
Example #17
Source File: test_apirequest.py From ec2-api with Apache License 2.0 | 5 votes |
def test_render_response_utf8(self):
    # Render a response containing non-ASCII code points (U+A000 and U+07B4)
    # and verify they survive XML rendering plus UTF-8 decoding intact.
    req = apirequest.APIRequest("FakeAction", "FakeVersion", {})
    resp = {
        'utf8': six.unichr(40960) + u'abcd' + six.unichr(1972)
    }
    data = req._render_response(resp, 'uuid').decode()
    self.assertIn('<utf8>ꀀabcd޴</utf8>', data)


# Tests for individual data element format functions
Example #18
Source File: parameter_types.py From masakari with Apache License 2.0 | 5 votes |
def _get_all_chars():
    """Yield every character with an ordinal in [0, 0xFFFF)."""
    return (six.unichr(code) for code in range(0xFFFF))


# build a regex that matches all printable characters. This allows
# spaces in the middle of the name. Also note that the regexp below
# deliberately allows the empty string. This is so only the constraint
# which enforces a minimum length for the name is triggered when an
# empty string is tested. Otherwise it is not deterministic which
# constraint fails and this causes issues for some unittests when
# PYTHONHASHSEED is set randomly.
Example #19
Source File: test_tokenizer.py From MARA_Framework with GNU Lesser General Public License v3.0 | 5 votes |
def unescape(test):
    """Decode \\uXXXX escapes in a tokenizer test case, in place."""
    def decode(inp):
        """Decode \\uXXXX escapes

        This decodes \\uXXXX escapes, possibly into non-BMP characters when
        two surrogate character escapes are adjacent to each other.
        """
        # This cannot be implemented using the unicode_escape codec
        # because that requires its input be ISO-8859-1, and we need
        # arbitrary unicode as input.
        def repl(m):
            if m.group(2) is not None:
                high = int(m.group(1), 16)
                low = int(m.group(2), 16)
                if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
                    # A valid surrogate pair: recombine into one code point.
                    cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
                    return unichr(cp)
                else:
                    # Two adjacent escapes that are not a pair: decode each.
                    return unichr(high) + unichr(low)
            else:
                return unichr(int(m.group(1), 16))
        try:
            return _surrogateRe.sub(repl, inp)
        except ValueError:
            # This occurs when unichr throws ValueError, which should
            # only be for a lone-surrogate.
            if utils.supports_lone_surrogates:
                raise
            return None
    test["input"] = decode(test["input"])
    for token in test["output"]:
        if token == "ParseError":
            continue
        else:
            token[1] = decode(token[1])
            if len(token) > 2:
                # NOTE(review): this iterates token[2] directly while deleting
                # and inserting keys — if token[2] is a dict this both unpacks
                # bare keys and mutates during iteration; looks like it was
                # meant to iterate a snapshot of .items(). Verify upstream.
                for key, value in token[2]:
                    del token[2][key]
                    token[2][decode(key)] = decode(value)
    return test
Example #20
Source File: page.py From ttrv with MIT License | 5 votes |
def _prompt_period(self, order):
    """Prompt for a time period and return the period-qualified order.

    Enter ('\\n') keeps the bare order; keys 1-6 pick a period suffix.
    Any other key returns None.
    """
    choices = {
        '\n': order,
        '1': '{0}-hour'.format(order),
        '2': '{0}-day'.format(order),
        '3': '{0}-week'.format(order),
        '4': '{0}-month'.format(order),
        '5': '{0}-year'.format(order),
        '6': '{0}-all'.format(order),
    }
    menu_lines = docs.TIME_ORDER_MENU.strip().splitlines()
    key_code = self.term.show_notification(menu_lines)
    # show_notification returns an ordinal; map it back to a character.
    return choices.get(six.unichr(key_code))
Example #21
Source File: test_manager.py From manila with Apache License 2.0 | 5 votes |
def test_put_rados_object(self):
    # U+00F6 ('ö'): a non-ASCII character to prove text is UTF-8 encoded
    # before being written to the rados store.
    faketext = six.unichr(246)
    self.mock_object(self._ceph_vol_client, 'put_object',
                     mock.Mock(return_value=None))
    ret = self._manager_with_rados_store._put_rados_object(
        'fakeobj', faketext)
    # The object must be written UTF-8 encoded, and the helper returns None.
    self._ceph_vol_client.put_object.assert_called_once_with(
        'fakepool', 'fakeobj', faketext.encode('utf-8'))
    self.assertIsNone(ret)
Example #22
Source File: test_manager.py From manila with Apache License 2.0 | 5 votes |
def test_get_rados_object(self):
    # UTF-8 bytes of U+00F6 ('ö'): checks the read path decodes correctly.
    fakebin = six.unichr(246).encode('utf-8')
    self.mock_object(self._ceph_vol_client, 'get_object',
                     mock.Mock(return_value=fakebin))
    ret = self._manager_with_rados_store._get_rados_object('fakeobj')
    self._ceph_vol_client.get_object.assert_called_once_with(
        'fakepool', 'fakeobj')
    # The helper must return the UTF-8 decoded text of the stored bytes.
    self.assertEqual(fakebin.decode('utf-8'), ret)
Example #23
Source File: codec.py From polyline with MIT License | 5 votes |
def _write(self, output, curr_value, prev_value, factor):
    """Write one coordinate delta to *output* using polyline encoding."""
    scaled_curr = self._py2_round(curr_value * factor)
    scaled_prev = self._py2_round(prev_value * factor)
    # Encode the delta, left-shifted one bit; negatives are folded into the
    # positive range by bitwise inversion (sign lives in the low bit).
    delta = (scaled_curr - scaled_prev) << 1
    if delta < 0:
        delta = ~delta
    # Emit 5-bit chunks, least significant first; 0x20 flags continuation.
    while delta >= 0x20:
        output.write(six.unichr((0x20 | (delta & 0x1f)) + 63))
        delta >>= 5
    output.write(six.unichr(delta + 63))
Example #24
Source File: log_processing_test.py From scalyr-agent-2 with Apache License 2.0 | 5 votes |
def test_multiple_redactions_in_line_with_hash_with_unicode(self):
    redactor = LogLineRedacter("/var/fake_log")
    # "\H1" replaces capture group 1 with the hash of its text rather than
    # a literal substitution.
    redactor.add_redaction_rule("(password)", "\\H1")
    # The line starts with U+2026 (ellipsis) to exercise unicode handling;
    # both occurrences of "password" must be hashed.
    self._run_case(
        redactor,
        unichr(8230) + "auth=password foo=password",
        unichr(8230) + "auth=%s foo=%s"
        % (md5_hexdigest("password"), md5_hexdigest("password")),
        True,
    )
Example #25
Source File: log_processing_test.py From scalyr-agent-2 with Apache License 2.0 | 5 votes |
def test_unicode_redactions(self): redacter = LogLineRedacter("/var/fake_log") # 2->TODO there is a bugfix of 're.subn' in python3.7 and higher. # Empty matches for the pattern are replaced when adjacent to a previous non-empty match. # on python3.6 and below it works incorrect and returns "bb...bb" but it should return "bb..bbbbbb" and it does so in python3.7+ redacter.add_redaction_rule("(.+)", "bb\\1bb") # build the utf8 string utf8_string = unichr(8230) expected = "bb" + utf8_string + "bb" # go go go self._run_case(redacter, utf8_string, expected, True)
Example #26
Source File: plantuml.py From plantweb with Apache License 2.0 | 5 votes |
def _encode6bit(b):
    """Map a 6-bit value to PlantUML's base64-like alphabet.

    0-9 -> '0'-'9', 10-35 -> 'A'-'Z', 36-61 -> 'a'-'z', 62 -> '-',
    63 -> '_'.  Values above 63 fall through and yield None, matching the
    original behaviour.
    """
    if b < 10:
        return unichr(48 + b)        # digits
    if b < 36:
        return unichr(55 + b)        # uppercase: 65 + (b - 10)
    if b < 62:
        return unichr(61 + b)        # lowercase: 97 + (b - 36)
    if b == 62:
        return '-'
    if b == 63:
        return '_'
Example #27
Source File: formatting.py From xmldiff with MIT License | 5 votes |
def get_placeholder(self, element, ttype, close_ph):
    """Return the placeholder character for *element*, allocating a new
    one (and registering it in both lookup tables) on first sight."""
    serialized = etree.tounicode(element)
    key = (serialized, ttype, close_ph)
    cached = self.tag2placeholder.get(key)
    if cached is not None:
        return cached
    # First time this tag/type/close combination is seen: take the next
    # unused code point as its placeholder character.
    self.placeholder += 1
    fresh = six.unichr(self.placeholder)
    self.placeholder2tag[fresh] = PlaceholderEntry(element, ttype, close_ph)
    self.tag2placeholder[key] = fresh
    return fresh
Example #28
Source File: compute_bleu.py From models with Apache License 2.0 | 5 votes |
def property_chars(self, prefix):
    """Return all unicode characters whose category starts with *prefix*.

    Args:
      prefix: a unicode category prefix, e.g. "P" (punctuation) or
        "S" (symbols).

    Returns:
      A string of every code point in [0, sys.maxunicode) whose
      unicodedata category begins with *prefix*.
    """
    # Convert each code point once instead of calling six.unichr twice
    # per iteration across the entire unicode range.
    return "".join(
        char for char in map(six.unichr, range(sys.maxunicode))
        if unicodedata.category(char).startswith(prefix))
Example #29
Source File: bleu_hook.py From tensor2tensor with Apache License 2.0 | 5 votes |
def property_chars(self, prefix):
    """Return all unicode characters whose category starts with *prefix*.

    Args:
      prefix: a unicode category prefix, e.g. "P" (punctuation) or
        "S" (symbols).

    Returns:
      A string of every code point in [0, sys.maxunicode) whose
      unicodedata category begins with *prefix*.
    """
    # Convert each code point once instead of calling six.unichr twice
    # per iteration across the entire unicode range.
    return "".join(
        char for char in map(six.unichr, range(sys.maxunicode))
        if unicodedata.category(char).startswith(prefix))
Example #30
Source File: ml_perf_bleu_metric.py From lingvo with Apache License 2.0 | 5 votes |
def property_chars(self, prefix):
    """Return all unicode characters whose category starts with *prefix*.

    Args:
      prefix: a unicode category prefix, e.g. "P" (punctuation) or
        "S" (symbols).

    Returns:
      A string of every code point in [0, sys.maxunicode) whose
      unicodedata category begins with *prefix*.
    """
    # Convert each code point once instead of calling six.unichr twice
    # per iteration across the entire unicode range.
    return "".join(
        char for char in map(six.unichr, range(sys.maxunicode))
        if unicodedata.category(char).startswith(prefix))