Python six.u() Examples

The following are 26 code examples of six.u(), collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all available functions/classes of the module six, or try the search function.
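As a quick orientation before the examples, here is a minimal sketch of what six.u() does (not taken from any of the projects below; it assumes six is installed): on Python 2 it decodes a plain string literal with the unicode_escape codec, so \uXXXX escapes are interpreted and a unicode object is returned; on Python 3, where str is already unicode, the text is returned unchanged.

import six

s = six.u('San Sebasti\u00e1n')  # a "fake" unicode literal
print(s)                         # San Sebastián on both Python 2 and 3
print(type(s))                   # <type 'unicode'> on Python 2, <class 'str'> on Python 3

# Note: six.u() is documented for string *literals* only. Some examples
# below pass it runtime data (e.g. six.u(str(num))); that relies on
# Python 2 behaviour, since on Python 3 the argument comes back unchanged.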
Example #1
Source File: colorize.py From DRL_DeliveryDuel with MIT License
def colorize(string, color, bold=False, highlight = False):
    """Return string surrounded by appropriate terminal color codes to
    print colorized text.  Valid colors: gray, red, green, yellow,
    blue, magenta, cyan, white, crimson
    """

    # Import six here so that `utils` has no import-time dependencies.
    # We want this since we use `utils` during our import-time sanity checks
    # that verify that our dependencies (including six) are actually present.
    import six

    attr = []
    num = color2num[color]
    if highlight: num += 10
    attr.append(six.u(str(num)))
    if bold: attr.append(six.u('1'))
    attrs = six.u(';').join(attr)
    return six.u('\x1b[%sm%s\x1b[0m') % (attrs, string) 
Example #2
Source File: word2vecReaderUtils.py From word2vec-twitter with MIT License
def tokenize(text, lowercase=False, deacc=False, errors="strict", to_lower=False, lower=False):
    """
    Iteratively yield tokens as unicode strings, optionally also lowercasing them
    and removing accent marks.

    Input text may be either unicode or utf8-encoded byte string.

    The tokens on output are maximal contiguous sequences of alphabetic
    characters (no digits!).

    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True))
    [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']

    """
    lowercase = lowercase or to_lower or lower
    text = to_unicode(text, errors=errors)
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    for match in PAT_ALPHABETIC.finditer(text):
        yield match.group() 
Example #3
Source File: word2vecReaderUtils.py From word2vec-twitter with MIT License
def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.

    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    norm = unicodedata.normalize("NFD", text)
    result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result) 
Example #4
Source File: util.py From articles with MIT License
def _deserialize_primitive(data, klass):
    """Deserializes to primitive type.

    :param data: data to deserialize.
    :param klass: class literal.

    :return: int, long, float, str, bool.
    :rtype: int | long | float | str | bool
    """
    try:
        value = klass(data)
    except UnicodeEncodeError:
        value = six.u(data)
    except TypeError:
        value = data
    return value 
Example #5
Source File: html.py From learn_python3_spider with MIT License
def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
        encoding=None):
    """Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\\n``, ``\\t``, ``\\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    """

    text = to_unicode(text, encoding)
    for ec in which_ones:
        text = text.replace(ec, to_unicode(replace_by, encoding))
    return text 
Example #6
Source File: html.py From learn_python3_spider with MIT License
def remove_tags_with_content(text, which_ones=(), encoding=None):
    """Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    u'<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    """

    text = to_unicode(text, encoding)
    if which_ones:
        tags = '|'.join([r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
        retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
        text = retags.sub(u'', text)
    return text 
Example #7
Source File: tests.py From python-opencage-geocoder with MIT License
def testDonostia(self):
        httpretty.register_uri(
            httpretty.GET,
            self.geocoder.url,
            body='{"thanks":"For using an OpenCage Data API","status":{"message":"OK","code":200},"rate":{"remaining":2482,"limit":"2500","reset":1402185600},"total_results":7,"results":[{"geometry":{"lat":"43.3213324","lng":"-1.9856227"},"annotations":{},"components":{"postcode":"20001;20002;20003;20004;20005;20006;20007;20008;20009;20010;20011;20012;20013;20014;20015;20016;20017;20018","county":"Donostialdea/Donostia-San Sebasti\u00e1n","state":"Basque Country","country":"Spain","city":"San Sebasti\u00e1n","country_code":"es"},"formatted":"San Sebasti\u00e1n, Donostialdea/Donostia-San Sebasti\u00e1n, 20001;20002;20003;20004;20005;20006;20007;20008;20009;20010;20011;20012;20013;20014;20015;20016;20017;20018, Basque Country, Spain, es","bounds":{"southwest":{"lat":"43.2178373","lng":"-2.086808"},"northeast":{"lng":"-1.8878838","lat":"43.3381344"}}},{"formatted":"Donostia, Irun, Bidasoa Beherea / Bajo Bidasoa, Basque Country, Spain, es","components":{"county":"Bidasoa Beherea / Bajo Bidasoa","state":"Basque Country","country":"Spain","city":"Irun","country_code":"es","road":"Donostia"},"bounds":{"southwest":{"lat":"43.3422299","lng":"-1.8022744"},"northeast":{"lng":"-1.8013452","lat":"43.3449598"}},"geometry":{"lng":"-1.8019153","lat":"43.3432784"},"annotations":{}},{"annotations":{},"geometry":{"lng":"-1.8022744","lat":"43.3422299"},"formatted":"Donostia, Anaka, Irun, Bidasoa Beherea / Bajo Bidasoa, Basque Country, Spain, es","components":{"county":"Bidasoa Beherea / Bajo Bidasoa","state":"Basque Country","country":"Spain","city":"Irun","suburb":"Anaka","country_code":"es","road":"Donostia"},"bounds":{"southwest":{"lng":"-1.8022971","lat":"43.3421635"},"northeast":{"lng":"-1.8022744","lat":"43.3422299"}}},{"geometry":{"lng":"-2.69312049872164","lat":"42.868297"},"annotations":{},"bounds":{"southwest":{"lng":"-2.6933154","lat":"42.8681484"},"northeast":{"lat":"42.8684357","lng":"-2.6929252"}},"formatted":"Donostia kalea, Ibaiondo, Vitoria-Gasteiz, Vitoria-Gasteizko Eskualdea / Cuadrilla de Vitoria, Basque Country, Spain, es","components":{"county":"Vitoria-Gasteizko Eskualdea / Cuadrilla de Vitoria","state":"Basque Country","country":"Spain","city":"Vitoria-Gasteiz","suburb":"Ibaiondo","country_code":"es","road":"Donostia kalea"}},{"bounds":{"southwest":{"lng":"-2.6889534","lat":"42.8620967"},"northeast":{"lat":"42.8623764","lng":"-2.6885774"}},"formatted":"Donostia kalea, Lakua, Vitoria-Gasteiz, Vitoria-Gasteizko Eskualdea / Cuadrilla de Vitoria, Basque Country, Spain, es","components":{"county":"Vitoria-Gasteizko Eskualdea / Cuadrilla de Vitoria","state":"Basque Country","country":"Spain","city":"Vitoria-Gasteiz","suburb":"Lakua","country_code":"es","road":"Donostia kalea"},"geometry":{"lat":"42.8622515","lng":"-2.68876582144679"},"annotations":{}},{"annotations":{},"geometry":{"lat":"51.5146888","lng":"-0.1609307"},"components":{"restaurant":"Donostia","country":"United Kingdom","state_district":"Greater London","country_code":"gb","county":"London","state":"England","suburb":"Marylebone","city":"City of Westminster","road":"Great Cumberland Mews"},"formatted":"Donostia, Great Cumberland Mews, Marylebone, City of Westminster, London, Greater London, England, United Kingdom, 
gb","bounds":{"northeast":{"lng":"-0.1608807","lat":"51.5147388"},"southwest":{"lat":"51.5146388","lng":"-0.1609807"}}},{"geometry":{"lat":43.31283,"lng":-1.97499},"annotations":{},"bounds":{"northeast":{"lng":"-1.92020404339","lat":"43.3401603699"},"southwest":{"lat":"43.2644081116","lng":"-2.04920697212"}},"formatted":"San Sebastian, Gipuzkoa, Basque Country, Spain, Donostia / San Sebasti\u00e1n","components":{"county":"Gipuzkoa","state":"Basque Country","country":"Spain","town":"San Sebastian","local administrative area":"Donostia / San Sebasti\u00e1n"}}],"timestamp":{"created_unix":1402136556,"created_http":"Sat, 07 Jun 2014 10:22:36 GMT"},"licenses":[{"name":"CC-BY-SA","url":"http://creativecommons.org/licenses/by-sa/3.0/"},{"name":"ODbL","url":"http://opendatacommons.org/licenses/odbl/summary/"}]}',

        )

        results = self.geocoder.geocode("Donostia")
        self.assertTrue(
            any((abs(result['geometry']['lat'] - 43.300836) < 0.05 and abs(result['geometry']['lng'] - -1.9809529) < 0.05) for result in results),
            msg="Bad result"
        )

        # test that the results are in unicode
        self.assertEqual(results[0]['formatted'], six.u('San Sebasti\xe1n, Donostialdea/Donostia-San Sebasti\xe1n, 20001;20002;20003;20004;20005;20006;20007;20008;20009;20010;20011;20012;20013;20014;20015;20016;20017;20018, Basque Country, Spain, es')) 
Example #8
Source File: html.py From learn_python3_spider with MIT License
def replace_tags(text, token='', encoding=None):
    """Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
    u'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\\xe7ais</b></p>', ' -- ', 'latin-1')
    u' -- Je ne parle pas  -- fran\\xe7ais --  -- '
    >>>

    """

    return _tag_re.sub(token, to_unicode(text, encoding)) 
Example #9
Source File: utils.py From pipenv with MIT License
def prepare_constraint_file(self):
        from pipenv.vendor.vistir.path import create_tracked_tempfile
        constraints_file = create_tracked_tempfile(
            mode="w",
            prefix="pipenv-",
            suffix="-constraints.txt",
            dir=self.req_dir,
            delete=False,
        )
        skip_args = ("build-isolation", "use-pep517", "cache-dir")
        args_to_add = [
            arg for arg in self.pip_args
            if not any(bad_arg in arg for bad_arg in skip_args)
        ]
        if self.sources:
            requirementstxt_sources = " ".join(args_to_add) if args_to_add else ""
            requirementstxt_sources = requirementstxt_sources.replace(" --", "\n--")
            constraints_file.write(u"{0}\n".format(requirementstxt_sources))
        constraints = self.initial_constraints
        constraints_file.write(u"\n".join([c for c in constraints]))
        constraints_file.close()
        return constraints_file.name 
Example #10
Source File: utils.py From Synonyms with MIT License
def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.

    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    norm = unicodedata.normalize("NFD", text)
    result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result) 
Example #11
Source File: utils.py From vk-requests with MIT License
def parse_masked_phone_number(html, parser=None):
    """Get masked phone number from security check html

    :param html: str: raw html text
    :param parser: bs4.BeautifulSoup: html parser
    :return: tuple of phone prefix and suffix, for example: ('+1234', '89')
    :rtype : tuple
    """
    if parser is None:
        parser = bs4.BeautifulSoup(html, 'html.parser')

    fields = parser.find_all('span', {'class': 'field_prefix'})
    if not fields:
        raise VkParseError(
            'No <span class="field_prefix">...</span> in the \n%s' % html)

    result = []
    for f in fields:
        value = f.get_text().replace(six.u('\xa0'), '')
        result.append(value)
    return tuple(result) 
Example #12
Source File: contrasts.py From vnpy_crypto with MIT License
def test__obj_to_readable_str():
    def t(obj, expected):
        got = _obj_to_readable_str(obj)
        assert type(got) is str
        assert got == expected
    t(1, "1")
    t(1.0, "1.0")
    t("asdf", "asdf")
    t(six.u("asdf"), "asdf")
    if sys.version_info >= (3,):
        # we can use "foo".encode here b/c this is python 3!
        # a utf-8 encoded euro-sign comes out as a real euro sign.
        t("\u20ac".encode("utf-8"), six.u("\u20ac"))
        # but a iso-8859-15 euro sign can't be decoded, and we fall back on
        # repr()
        t("\u20ac".encode("iso-8859-15"), "b'\\xa4'")
    else:
        t(six.u("\u20ac"), "u'\\u20ac'") 
Example #13
Source File: test_sci.py From python-devicecloud with Mozilla Public License 2.0
def test_sci_update_firmware_attribute(self):

        self._prepare_sci_response(EXAMPLE_UPDATE_FIRMWARE_INVALID_ATTRIBUTE_RESPONSE)
        self.dc.get_sci_api().send_sci(
            operation="update_firmware",
            attribute="filename=\"abcd.bin\"",
            target=DeviceTarget('00000000-00000000-00409dff-ffaabbcc'),
            payload=EXAMPLE_UPDATE_FIRMWARE_INVALID_ATTRIBUTE_REQUEST_PAYLOAD)

        request = httpretty.last_request().body.decode('utf8')
        request = ''.join([line.strip() for line in request.splitlines()])
        self.assertEqual(request,
                         six.u('<sci_request version="1.0">'
                               '<update_firmware filename="abcd.bin">'
                               '<targets>'
                               '<device id="00000000-00000000-00409dff-ffaabbcc"/>'
                               '</targets>'
                               '<data>aHNxcAbAADUct1cAAACAHEBAAAEABEAwAIBAAQAAACOFFzU</data>'
                               '</update_firmware>'
                               '</sci_request>')) 
Example #14
Source File: test_sci.py From python-devicecloud with Mozilla Public License 2.0
def test_sci_no_parameters(self):
        self._prepare_sci_response(EXAMPLE_SCI_REQUEST_RESPONSE)
        self.dc.get_sci_api().send_sci(
            operation="send_message",
            target=DeviceTarget('00000000-00000000-00409dff-ffaabbcc'),
            payload=EXAMPLE_SCI_REQUEST_PAYLOAD)
        request = httpretty.last_request().body.decode('utf8')
        # Strip white space from lines and concatenate request
        request = ''.join([line.strip() for line in request.splitlines()])
        self.assertEqual(request,
                         six.u('<sci_request version="1.0">'
                               '<send_message>'
                               '<targets>'
                               '<device id="00000000-00000000-00409dff-ffaabbcc"/>'
                               '</targets>'
                               '<rci_request version="1.1">'
                               '<query_state>'
                               '<device_stats/>'
                               '</query_state>'
                               '</rci_request>'
                               '</send_message>'
                               '</sci_request>')) 
Example #15
Source File: test_annotator.py From python-stanford-corenlp with MIT License
def test_tokenizer():
    cases = [(u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)",
              u"rt @ #happyfuncoding : this is a typical twitter tweet :-)".split()),
             (u"HTML entities &amp; other Web oddities can be an &aacute;cute <em class='grumpy'>pain</em> >:(",
              u"html entities and other web oddities can be an ácute".split() + [u"<em class='grumpy'>", u"pain", u"</em>", u">:("]),
             (u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.",
              u"it's perhaps noteworthy that phone numbers like".split() + [u"+1 (800) 123-4567", u",", u"(800) 123-4567", u",", u"and", u"123-4567"] + u"are treated as words despite their whitespace .".split())
            ]

    annotator = HappyFunTokenizer()
    annotator.start()

    try:
        with corenlp.CoreNLPClient(properties=annotator.properties, annotators="happyfun ssplit pos".split()) as client:
            for text, tokens in cases:
                ann = client.annotate(text)
                tokens_ = [t.word for t in ann.sentence[0].token]
                assert tokens == tokens_
    finally:
        annotator.terminate()
        annotator.join() 
Example #16
Source File: serde_weights.py From ngraph-python with Apache License 2.0
def write_np_values(values, f):
    """
    Arguments:
        values: {str: np.array}
        f: filename or filelike object
    """
    with ZipFile(f, 'w') as zf:
        for k, v in values.items():
            # Need to do this because Python zipfile has some odd support for filenames:
            # http://bugs.python.org/issue24110
            if len(k) == 16 and isinstance(k, six.binary_type):  # valid UUID bytes
                zf.writestr(str(uuid.UUID(bytes=k)), v.tostring())
            else:
                zf.writestr(six.u(k), v.tostring())

        zf.writestr(MANIFEST_FILENAME, json_dumps_manifest(values)) 
Example #17
Source File: utils.py From topical_word_embeddings with MIT License
def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.

    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    norm = unicodedata.normalize("NFD", text)
    result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result) 
Example #18
Source File: utils.py From topical_word_embeddings with MIT License
def tokenize(text, lowercase=False, deacc=False, errors="strict", to_lower=False, lower=False):
    """
    Iteratively yield tokens as unicode strings, optionally also lowercasing them
    and removing accent marks.

    Input text may be either unicode or utf8-encoded byte string.

    The tokens on output are maximal contiguous sequences of alphabetic
    characters (no digits!).

    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True))
    [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']

    """
    lowercase = lowercase or to_lower or lower
    text = to_unicode(text, errors=errors)
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    for match in PAT_ALPHABETIC.finditer(text):
        yield match.group() 
Example #19
Source File: test_annotator.py From python-stanford-corenlp with MIT License
def test_annotator_annotate():
    cases = [(u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)",
              u"rt @ #happyfuncoding : this is a typical twitter tweet :-)".split()),
             (u"HTML entities &amp; other Web oddities can be an &aacute;cute <em class='grumpy'>pain</em> >:(",
              u"html entities and other web oddities can be an ácute".split() + [u"<em class='grumpy'>", u"pain", u"</em>", u">:("]),
             (u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.",
              u"it's perhaps noteworthy that phone numbers like".split() + [u"+1 (800) 123-4567", u",", u"(800) 123-4567", u",", u"and", u"123-4567"] + u"are treated as words despite their whitespace .".split())
            ]

    annotator = HappyFunTokenizer()

    for text, tokens in cases:
        ann = corenlp.Document()
        ann.text = text
        annotator.annotate(ann)
        tokens_ = [t.word for t in ann.sentencelessToken]
        assert tokens_ == tokens 
Example #20
Source File: test_annotator.py From python-stanford-corenlp with MIT License
def annotate(self, ann):
        """
        @ann: is a protobuf annotation object.
        Actually populate @ann with tokens.
        """
        buf, beg_idx, end_idx = ann.text.lower(), 0, 0
        for i, word in enumerate(self.tokenize(ann.text)):
            token = ann.sentencelessToken.add()
            # These are the bare minimum required for the TokenAnnotation
            token.word = six.u(word)
            token.tokenBeginIndex = i
            token.tokenEndIndex = i+1

            # Seek into the txt until you can find this word.
            try:
                # Try to update the beginning index.
                beg_idx = buf.index(word, beg_idx)
            except ValueError:
                # Word not found; keep the previous (approximate) offset.
                pass
            end_idx = beg_idx + len(word)

            token.beginChar = beg_idx
            token.endChar = end_idx

            beg_idx = end_idx 
Example #21
Source File: easy_install.py From Safejumper-for-Desktop with GNU General Public License v2.0
def _tmpdir(self):
        tmpdir = tempfile.mkdtemp(prefix=six.u("easy_install-"))
        try:
            # cast to str as workaround for #709 and #710 and #712
            yield str(tmpdir)
        finally:
            os.path.exists(tmpdir) and rmtree(rmtree_safe(tmpdir)) 
Example #22
Source File: utils.py From pipenv with MIT License
def clean_pkg_version(version):
    """Uses pip to prepare a package version string, from our internal version."""
    return six.u(pep440_version(str(version).replace("==", ""))) 
Example #23
Source File: test_websocket.py From deepWordBug with Apache License 2.0
def testWSKey(self):
        key = _create_sec_websocket_key()
        self.assertEqual(len(key), 24)  # base64 encoding of a 16-byte key
        self.assertTrue(six.u("\n") not in key) 
Example #24
Source File: test_deployer.py From k8s with Apache License 2.0
def test_create_blank_deployment(self):
        object_meta = ObjectMeta(name=NAME, namespace=NAMESPACE)
        deployment = Deployment(metadata=object_meta)
        assert deployment.as_dict()[u"metadata"][u"name"] == NAME 
Example #25
Source File: test_classic_plugin.py From eyeD3 with GNU General Public License v3.0
def test_lyrics(audiofile, tmpdir, eyeD3):
    lyrics_files = []
    for i in range(1, 4):
        lfile = tmpdir / "lryics{:d}".format(i)
        lfile.write_text((six.u(str(i)) * (100 * i)), "utf8")
        lyrics_files.append(lfile)

    audiofile = eyeD3(audiofile,
                      ["--add-lyrics", "{}".format(lyrics_files[0]),
                        "--add-lyrics", "{}:desc".format(lyrics_files[1]),
                        "--add-lyrics", "{}:foo:en".format(lyrics_files[1]),
                        "--add-lyrics", "{}:foo:es".format(lyrics_files[2]),
                        "--add-lyrics", "{}:foo:de".format(lyrics_files[0]),
                       ])
    assert len(audiofile.tag.lyrics) == 5
    assert audiofile.tag.lyrics.get("").text == ("1" * 100)
    assert audiofile.tag.lyrics.get("desc").text == ("2" * 200)
    assert audiofile.tag.lyrics.get("foo", "en").text == ("2" * 200)
    assert audiofile.tag.lyrics.get("foo", "es").text == ("3" * 300)
    assert audiofile.tag.lyrics.get("foo", "de").text == ("1" * 100)

    audiofile = eyeD3(audiofile, ["--remove-lyrics", "foo:xxx"])
    assert len(audiofile.tag.lyrics) == 5

    audiofile = eyeD3(audiofile, ["--remove-lyrics", "foo:es"])
    assert len(audiofile.tag.lyrics) == 4

    audiofile = eyeD3(audiofile, ["--remove-lyrics", "desc"])
    assert len(audiofile.tag.lyrics) == 3

    audiofile = eyeD3(audiofile, ["--remove-all-lyrics"])
    assert len(audiofile.tag.lyrics) == 0

    eyeD3(audiofile, ["--add-lyrics", "eminem.txt"], expected_retval=2) 
Example #26
Source File: test_websocket.py From deepWordBug with Apache License 2.0
def testSecureWebSocket(self):
        import ssl
        s = ws.create_connection("wss://echo.websocket.org/")
        self.assertNotEqual(s, None)
        self.assertTrue(isinstance(s.sock, ssl.SSLSocket))
        s.send("Hello, World")
        result = s.recv()
        self.assertEqual(result, "Hello, World")
        s.send(u"こにゃにゃちは、世界")
        result = s.recv()
        self.assertEqual(result, "こにゃにゃちは、世界")
        s.close()