Python nltk.compat.PY3 Examples

The following code examples show how to use the nltk.compat.PY3 flag (a boolean constant that is true when running under Python 3). They are drawn from open-source Python projects. You can vote up the examples you find helpful or vote down those you don't.

Example 1
Project: razzy-spinner   Author: rafasashi   File: common.py    GNU General Public License v3.0 6 votes vote down vote up
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Build a (writer, file) pair appropriate for the running Python version.

    Under Python 3 the file is opened in text mode with the requested
    encoding and a plain ``csv.writer`` is returned; under Python 2 the
    file is opened in binary mode and wrapped in ``compat.UnicodeWriter``,
    which performs the encoding itself.

    :param outfile: path of the file to open for writing
    :param encoding: character encoding for the output
    :param errors: codec error-handling scheme (as for ``str.encode``)
    :param gzip_compress: if True, write through ``gzip.open``
    :return: tuple ``(writer, outf)`` of the CSV writer and the open file
    """
    opener = gzip.open if gzip_compress else open
    if compat.PY3:
        # Text-mode handle; csv.writer emits already-decoded text.
        mode = 'wt' if gzip_compress else 'w'
        outf = opener(outfile, mode, encoding=encoding, errors=errors)
        writer = csv.writer(outf)
    else:
        # Binary-mode handle; UnicodeWriter does the encoding step.
        outf = opener(outfile, 'wb')
        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
    return (writer, outf)
Example 2
Project: razzy-spinner   Author: rafasashi   File: textcat.py    GNU General Public License v3.0 6 votes vote down vote up
def calc_dist(self, lang, trigram, text_profile):
        """Return the "out-of-place" distance for a single trigram.

        The distance is the absolute difference between the trigram's
        rank in the language profile for ``lang`` and its rank in
        ``text_profile``.  A trigram missing from the language profile
        receives the largest representable integer as a penalty.
        """
        lang_fd = self._corpus.lang_freq(lang)
        if trigram not in lang_fd:
            # Penalty chosen to exceed any possible trigram-file length.
            return maxsize if PY3 else maxint
        # Rank of the trigram within each (ordered) profile.
        rank_in_lang = list(lang_fd.keys()).index(trigram)
        rank_in_text = list(text_profile.keys()).index(trigram)
        return abs(rank_in_lang - rank_in_text)
Example 3
Project: OpenBottle   Author: xiaozhuchacha   File: common.py    MIT License 6 votes vote down vote up
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Build a (writer, file) pair appropriate for the running Python version.

    Under Python 3 the file is opened in text mode with the requested
    encoding and a plain ``csv.writer`` is returned; under Python 2 the
    file is opened in binary mode and wrapped in ``compat.UnicodeWriter``,
    which performs the encoding itself.

    :param outfile: path of the file to open for writing
    :param encoding: character encoding for the output
    :param errors: codec error-handling scheme (as for ``str.encode``)
    :param gzip_compress: if True, write through ``gzip.open``
    :return: tuple ``(writer, outf)`` of the CSV writer and the open file
    """
    opener = gzip.open if gzip_compress else open
    if compat.PY3:
        # Text-mode handle; csv.writer emits already-decoded text.
        mode = 'wt' if gzip_compress else 'w'
        outf = opener(outfile, mode, encoding=encoding, errors=errors)
        writer = csv.writer(outf)
    else:
        # Binary-mode handle; UnicodeWriter does the encoding step.
        outf = opener(outfile, 'wb')
        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
    return (writer, outf)
Example 4
Project: OpenBottle   Author: xiaozhuchacha   File: textcat.py    MIT License 6 votes vote down vote up
def calc_dist(self, lang, trigram, text_profile):
        """Return the "out-of-place" distance for a single trigram.

        The distance is the absolute difference between the trigram's
        rank in the language profile for ``lang`` and its rank in
        ``text_profile``.  A trigram missing from the language profile
        receives the largest representable integer as a penalty.
        """
        lang_fd = self._corpus.lang_freq(lang)
        if trigram not in lang_fd:
            # Penalty chosen to exceed any possible trigram-file length.
            return maxsize if PY3 else maxint
        # Rank of the trigram within each (ordered) profile.
        rank_in_lang = list(lang_fd.keys()).index(trigram)
        rank_in_text = list(text_profile.keys()).index(trigram)
        return abs(rank_in_lang - rank_in_text)
Example 5
Project: OpenBottle   Author: xiaozhuchacha   File: common.py    MIT License 6 votes vote down vote up
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Build a (writer, file) pair appropriate for the running Python version.

    Under Python 3 the file is opened in text mode with the requested
    encoding and a plain ``csv.writer`` is returned; under Python 2 the
    file is opened in binary mode and wrapped in ``compat.UnicodeWriter``,
    which performs the encoding itself.

    :param outfile: path of the file to open for writing
    :param encoding: character encoding for the output
    :param errors: codec error-handling scheme (as for ``str.encode``)
    :param gzip_compress: if True, write through ``gzip.open``
    :return: tuple ``(writer, outf)`` of the CSV writer and the open file
    """
    opener = gzip.open if gzip_compress else open
    if compat.PY3:
        # Text-mode handle; csv.writer emits already-decoded text.
        mode = 'wt' if gzip_compress else 'w'
        outf = opener(outfile, mode, encoding=encoding, errors=errors)
        writer = csv.writer(outf)
    else:
        # Binary-mode handle; UnicodeWriter does the encoding step.
        outf = opener(outfile, 'wb')
        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
    return (writer, outf)
Example 6
Project: OpenBottle   Author: xiaozhuchacha   File: textcat.py    MIT License 6 votes vote down vote up
def calc_dist(self, lang, trigram, text_profile):
        """Return the "out-of-place" distance for a single trigram.

        The distance is the absolute difference between the trigram's
        rank in the language profile for ``lang`` and its rank in
        ``text_profile``.  A trigram missing from the language profile
        receives the largest representable integer as a penalty.
        """
        lang_fd = self._corpus.lang_freq(lang)
        if trigram not in lang_fd:
            # Penalty chosen to exceed any possible trigram-file length.
            return maxsize if PY3 else maxint
        # Rank of the trigram within each (ordered) profile.
        rank_in_lang = list(lang_fd.keys()).index(trigram)
        rank_in_text = list(text_profile.keys()).index(trigram)
        return abs(rank_in_lang - rank_in_text)
Example 7
Project: Health-Checker   Author: KriAga   File: common.py    MIT License 6 votes vote down vote up
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Build a (writer, file) pair appropriate for the running Python version.

    Under Python 3 the file is opened in text mode with the requested
    encoding and a plain ``csv.writer`` is returned; under Python 2 the
    file is opened in binary mode and wrapped in ``compat.UnicodeWriter``,
    which performs the encoding itself.

    :param outfile: path of the file to open for writing
    :param encoding: character encoding for the output
    :param errors: codec error-handling scheme (as for ``str.encode``)
    :param gzip_compress: if True, write through ``gzip.open``
    :return: tuple ``(writer, outf)`` of the CSV writer and the open file
    """
    opener = gzip.open if gzip_compress else open
    if compat.PY3:
        # Text-mode handle; csv.writer emits already-decoded text.
        mode = 'wt' if gzip_compress else 'w'
        outf = opener(outfile, mode, encoding=encoding, errors=errors)
        writer = csv.writer(outf)
    else:
        # Binary-mode handle; UnicodeWriter does the encoding step.
        outf = opener(outfile, 'wb')
        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
    return (writer, outf)
Example 8
Project: Health-Checker   Author: KriAga   File: textcat.py    MIT License 6 votes vote down vote up
def calc_dist(self, lang, trigram, text_profile):
        """Return the "out-of-place" distance for a single trigram.

        The distance is the absolute difference between the trigram's
        rank in the language profile for ``lang`` and its rank in
        ``text_profile``.  A trigram missing from the language profile
        receives the largest representable integer as a penalty.
        """
        lang_fd = self._corpus.lang_freq(lang)
        if trigram not in lang_fd:
            # Penalty chosen to exceed any possible trigram-file length.
            return maxsize if PY3 else maxint
        # Rank of the trigram within each (ordered) profile.
        rank_in_lang = list(lang_fd.keys()).index(trigram)
        rank_in_text = list(text_profile.keys()).index(trigram)
        return abs(rank_in_lang - rank_in_text)
Example 9
Project: FancyWord   Author: EastonLee   File: textcat.py    GNU General Public License v3.0 6 votes vote down vote up
def calc_dist(self, lang, trigram, text_profile):
        """Return the "out-of-place" distance for a single trigram.

        The distance is the absolute difference between the trigram's
        rank in the language profile for ``lang`` and its rank in
        ``text_profile``.  A trigram missing from the language profile
        receives the largest representable integer as a penalty.
        """
        lang_fd = self._corpus.lang_freq(lang)
        if trigram not in lang_fd:
            # Penalty chosen to exceed any possible trigram-file length.
            return maxsize if PY3 else maxint
        # Rank of the trigram within each (ordered) profile.
        rank_in_lang = list(lang_fd.keys()).index(trigram)
        rank_in_text = list(text_profile.keys()).index(trigram)
        return abs(rank_in_lang - rank_in_text)
Example 10
Project: honours_project   Author: JFriel   File: common.py    GNU General Public License v3.0 6 votes vote down vote up
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Build a (writer, file) pair appropriate for the running Python version.

    Under Python 3 the file is opened in text mode with the requested
    encoding and a plain ``csv.writer`` is returned; under Python 2 the
    file is opened in binary mode and wrapped in ``compat.UnicodeWriter``,
    which performs the encoding itself.

    :param outfile: path of the file to open for writing
    :param encoding: character encoding for the output
    :param errors: codec error-handling scheme (as for ``str.encode``)
    :param gzip_compress: if True, write through ``gzip.open``
    :return: tuple ``(writer, outf)`` of the CSV writer and the open file
    """
    opener = gzip.open if gzip_compress else open
    if compat.PY3:
        # Text-mode handle; csv.writer emits already-decoded text.
        mode = 'wt' if gzip_compress else 'w'
        outf = opener(outfile, mode, encoding=encoding, errors=errors)
        writer = csv.writer(outf)
    else:
        # Binary-mode handle; UnicodeWriter does the encoding step.
        outf = opener(outfile, 'wb')
        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
    return (writer, outf)
Example 11
Project: honours_project   Author: JFriel   File: textcat.py    GNU General Public License v3.0 6 votes vote down vote up
def calc_dist(self, lang, trigram, text_profile):
        """Return the "out-of-place" distance for a single trigram.

        The distance is the absolute difference between the trigram's
        rank in the language profile for ``lang`` and its rank in
        ``text_profile``.  A trigram missing from the language profile
        receives the largest representable integer as a penalty.
        """
        lang_fd = self._corpus.lang_freq(lang)
        if trigram not in lang_fd:
            # Penalty chosen to exceed any possible trigram-file length.
            return maxsize if PY3 else maxint
        # Rank of the trigram within each (ordered) profile.
        rank_in_lang = list(lang_fd.keys()).index(trigram)
        rank_in_text = list(text_profile.keys()).index(trigram)
        return abs(rank_in_lang - rank_in_text)
Example 12
Project: honours_project   Author: JFriel   File: common.py    GNU General Public License v3.0 6 votes vote down vote up
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Build a (writer, file) pair appropriate for the running Python version.

    Under Python 3 the file is opened in text mode with the requested
    encoding and a plain ``csv.writer`` is returned; under Python 2 the
    file is opened in binary mode and wrapped in ``compat.UnicodeWriter``,
    which performs the encoding itself.

    :param outfile: path of the file to open for writing
    :param encoding: character encoding for the output
    :param errors: codec error-handling scheme (as for ``str.encode``)
    :param gzip_compress: if True, write through ``gzip.open``
    :return: tuple ``(writer, outf)`` of the CSV writer and the open file
    """
    opener = gzip.open if gzip_compress else open
    if compat.PY3:
        # Text-mode handle; csv.writer emits already-decoded text.
        mode = 'wt' if gzip_compress else 'w'
        outf = opener(outfile, mode, encoding=encoding, errors=errors)
        writer = csv.writer(outf)
    else:
        # Binary-mode handle; UnicodeWriter does the encoding step.
        outf = opener(outfile, 'wb')
        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
    return (writer, outf)
Example 13
Project: honours_project   Author: JFriel   File: textcat.py    GNU General Public License v3.0 6 votes vote down vote up
def calc_dist(self, lang, trigram, text_profile):
        """Return the "out-of-place" distance for a single trigram.

        The distance is the absolute difference between the trigram's
        rank in the language profile for ``lang`` and its rank in
        ``text_profile``.  A trigram missing from the language profile
        receives the largest representable integer as a penalty.
        """
        lang_fd = self._corpus.lang_freq(lang)
        if trigram not in lang_fd:
            # Penalty chosen to exceed any possible trigram-file length.
            return maxsize if PY3 else maxint
        # Rank of the trigram within each (ordered) profile.
        rank_in_lang = list(lang_fd.keys()).index(trigram)
        rank_in_text = list(text_profile.keys()).index(trigram)
        return abs(rank_in_lang - rank_in_text)
Example 14
Project: aop-helpFinder   Author: jecarvaill   File: common.py    GNU General Public License v3.0 6 votes vote down vote up
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Build a (writer, file) pair appropriate for the running Python version.

    Under Python 3 the file is opened in text mode with the requested
    encoding and a plain ``csv.writer`` is returned; under Python 2 the
    file is opened in binary mode and wrapped in ``compat.UnicodeWriter``,
    which performs the encoding itself.

    :param outfile: path of the file to open for writing
    :param encoding: character encoding for the output
    :param errors: codec error-handling scheme (as for ``str.encode``)
    :param gzip_compress: if True, write through ``gzip.open``
    :return: tuple ``(writer, outf)`` of the CSV writer and the open file
    """
    opener = gzip.open if gzip_compress else open
    if compat.PY3:
        # Text-mode handle; csv.writer emits already-decoded text.
        mode = 'wt' if gzip_compress else 'w'
        outf = opener(outfile, mode, encoding=encoding, errors=errors)
        writer = csv.writer(outf)
    else:
        # Binary-mode handle; UnicodeWriter does the encoding step.
        outf = opener(outfile, 'wb')
        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
    return (writer, outf)
Example 15
Project: aop-helpFinder   Author: jecarvaill   File: textcat.py    GNU General Public License v3.0 6 votes vote down vote up
def calc_dist(self, lang, trigram, text_profile):
        """Return the "out-of-place" distance for a single trigram.

        The distance is the absolute difference between the trigram's
        rank in the language profile for ``lang`` and its rank in
        ``text_profile``.  A trigram missing from the language profile
        receives the largest representable integer as a penalty.
        """
        lang_fd = self._corpus.lang_freq(lang)
        if trigram not in lang_fd:
            # Penalty chosen to exceed any possible trigram-file length.
            return maxsize if PY3 else maxint
        # Rank of the trigram within each (ordered) profile.
        rank_in_lang = list(lang_fd.keys()).index(trigram)
        rank_in_text = list(text_profile.keys()).index(trigram)
        return abs(rank_in_lang - rank_in_text)
Example 16
Project: serverless-chatbots-workshop   Author: datteswararao   File: common.py    Apache License 2.0 6 votes vote down vote up
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Build a (writer, file) pair appropriate for the running Python version.

    Under Python 3 the file is opened in text mode with the requested
    encoding and a plain ``csv.writer`` is returned; under Python 2 the
    file is opened in binary mode and wrapped in ``compat.UnicodeWriter``,
    which performs the encoding itself.

    :param outfile: path of the file to open for writing
    :param encoding: character encoding for the output
    :param errors: codec error-handling scheme (as for ``str.encode``)
    :param gzip_compress: if True, write through ``gzip.open``
    :return: tuple ``(writer, outf)`` of the CSV writer and the open file
    """
    opener = gzip.open if gzip_compress else open
    if compat.PY3:
        # Text-mode handle; csv.writer emits already-decoded text.
        mode = 'wt' if gzip_compress else 'w'
        outf = opener(outfile, mode, encoding=encoding, errors=errors)
        writer = csv.writer(outf)
    else:
        # Binary-mode handle; UnicodeWriter does the encoding step.
        outf = opener(outfile, 'wb')
        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
    return (writer, outf)
Example 17
Project: serverless-chatbots-workshop   Author: datteswararao   File: textcat.py    Apache License 2.0 6 votes vote down vote up
def calc_dist(self, lang, trigram, text_profile):
        """Return the "out-of-place" distance for a single trigram.

        The distance is the absolute difference between the trigram's
        rank in the language profile for ``lang`` and its rank in
        ``text_profile``.  A trigram missing from the language profile
        receives the largest representable integer as a penalty.
        """
        lang_fd = self._corpus.lang_freq(lang)
        if trigram not in lang_fd:
            # Penalty chosen to exceed any possible trigram-file length.
            return maxsize if PY3 else maxint
        # Rank of the trigram within each (ordered) profile.
        rank_in_lang = list(lang_fd.keys()).index(trigram)
        rank_in_text = list(text_profile.keys()).index(trigram)
        return abs(rank_in_lang - rank_in_text)
Example 18
Project: serverless-chatbots-workshop   Author: datteswararao   File: common.py    Apache License 2.0 6 votes vote down vote up
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Build a (writer, file) pair appropriate for the running Python version.

    Under Python 3 the file is opened in text mode with the requested
    encoding and a plain ``csv.writer`` is returned; under Python 2 the
    file is opened in binary mode and wrapped in ``compat.UnicodeWriter``,
    which performs the encoding itself.

    :param outfile: path of the file to open for writing
    :param encoding: character encoding for the output
    :param errors: codec error-handling scheme (as for ``str.encode``)
    :param gzip_compress: if True, write through ``gzip.open``
    :return: tuple ``(writer, outf)`` of the CSV writer and the open file
    """
    opener = gzip.open if gzip_compress else open
    if compat.PY3:
        # Text-mode handle; csv.writer emits already-decoded text.
        mode = 'wt' if gzip_compress else 'w'
        outf = opener(outfile, mode, encoding=encoding, errors=errors)
        writer = csv.writer(outf)
    else:
        # Binary-mode handle; UnicodeWriter does the encoding step.
        outf = opener(outfile, 'wb')
        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
    return (writer, outf)
Example 19
Project: serverless-chatbots-workshop   Author: datteswararao   File: textcat.py    Apache License 2.0 6 votes vote down vote up
def calc_dist(self, lang, trigram, text_profile):
        """Return the "out-of-place" distance for a single trigram.

        The distance is the absolute difference between the trigram's
        rank in the language profile for ``lang`` and its rank in
        ``text_profile``.  A trigram missing from the language profile
        receives the largest representable integer as a penalty.
        """
        lang_fd = self._corpus.lang_freq(lang)
        if trigram not in lang_fd:
            # Penalty chosen to exceed any possible trigram-file length.
            return maxsize if PY3 else maxint
        # Rank of the trigram within each (ordered) profile.
        rank_in_lang = list(lang_fd.keys()).index(trigram)
        rank_in_text = list(text_profile.keys()).index(trigram)
        return abs(rank_in_lang - rank_in_text)
Example 20
Project: razzy-spinner   Author: rafasashi   File: toolbox.py    GNU General Public License v3.0 5 votes vote down vote up
def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
        """
        Return an iterator that returns the next field in a ``(marker, value)``
        tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding``
        was specified in the ``fields()`` method. Otherwise they are non-unicode strings.

        :param strip: strip trailing whitespace from the last line of each field
        :type strip: bool
        :param unwrap: Convert newlines in a field to spaces.
        :type unwrap: bool
        :param encoding: Name of an encoding to use. If it is specified then
            the ``fields()`` method returns unicode strings rather than non
            unicode strings.
        :type encoding: str or None
        :param errors: Error handling scheme for codec. Same as the ``decode()``
            builtin string method.
        :type errors: str
        :param unicode_fields: Set of marker names whose values are UTF-8 encoded.
            Ignored if encoding is None. If the whole file is UTF-8 encoded set
            ``encoding='utf8'`` and leave ``unicode_fields`` with its default
            value of None.
        :type unicode_fields: sequence
        :rtype: iter(tuple(str, str))
        """
        # unicode_fields only makes sense when an encoding is also given.
        if encoding is None and unicode_fields is not None:
            raise ValueError('unicode_fields is set but not encoding.')
        # Collapses any run of newlines inside a field value into one space.
        unwrap_pat = re.compile(r'\n+')
        for mkr, val in self.raw_fields():
            if encoding and not PY3: # kludge - already decoded in PY3?
                # Python 2 only: the decode calls imply raw_fields() yields
                # byte strings here.  Markers listed in unicode_fields are
                # decoded as UTF-8 regardless of the file-level encoding.
                if unicode_fields is not None and mkr in unicode_fields:
                    val = val.decode('utf8', errors)
                else:
                    val = val.decode(encoding, errors)
                mkr = mkr.decode(encoding, errors)
            if unwrap:
                val = unwrap_pat.sub(' ', val)
            if strip:
                # Trailing whitespace is stripped after unwrapping so the
                # last (possibly joined) line is what gets trimmed.
                val = val.rstrip()
            yield (mkr, val)
Example 21
Project: razzy-spinner   Author: rafasashi   File: crubadan.py    GNU General Public License v3.0 5 votes vote down vote up
def _load_lang_mapping_data(self):
        """Populate ``self._lang_mapping_data`` from the corpus' mapper table.

        The mapper file is tab-separated; each row becomes a list of
        fields mapping a Crubadan code to its language description.

        :raises RuntimeError: if the corpus is not installed locally or
            the mapper file is missing.
        """
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        # Python 3 decodes at read time; Python 2 reads bytes and decodes
        # afterwards.  Context managers ensure the handle is closed (the
        # original leaked it).
        if PY3:
            with open(mapper_file, 'r', encoding='utf-8') as fh:
                raw = fh.read().strip()
        else:
            with open(mapper_file, 'rU') as fh:
                raw = fh.read().decode('utf-8').strip()

        self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
Example 22
Project: razzy-spinner   Author: rafasashi   File: crubadan.py    GNU General Public License v3.0 5 votes vote down vote up
def _load_lang_ngrams(self, lang):
        """Load the 3-gram file for ISO 639-3 code ``lang`` into a FreqDist.

        Each line of the data file holds ``"<freq> <ngram>"``.

        :param lang: ISO 639-3 language code
        :return: FreqDist mapping each trigram to its frequency
        :raises RuntimeError: if the language is unsupported or its
            n-gram file is missing.
        """
        if lang not in self.langs():
            raise RuntimeError("Unsupported language.")

        crubadan_code = self.iso_to_crubadan(lang)
        ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

        if not path.isfile(ngram_file):
            # Fixed: the original raised ``Runtime(...)``, an undefined name
            # that would itself crash with NameError.
            raise RuntimeError("No N-gram file found for requested language.")

        counts = FreqDist()
        # Python 3 decodes at read time; Python 2 decodes each line below.
        if PY3:
            f = open(ngram_file, 'r', encoding='utf-8')
        else:
            f = open(ngram_file, 'rU')

        try:
            for line in f:
                data = line.split(' ') if PY3 else line.decode('utf8').split(' ')
                ngram = data[1].strip('\n')
                freq = int(data[0])
                counts[ngram] = freq
        finally:
            # The original never closed the handle; always release it.
            f.close()

        return counts
Example 23
Project: razzy-spinner   Author: rafasashi   File: test_2x_compat.py    GNU General Public License v3.0 5 votes vote down vote up
def setup_module(module):
    """Skip this module under Python 3; it tests Python-2-only compat code."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
Example 24
Project: razzy-spinner   Author: rafasashi   File: compat_fixt.py    GNU General Public License v3.0 5 votes vote down vote up
def setup_module(module):
    """Skip this fixture under Python 3; compat.doctest targets Python 2."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("compat.doctest is for Python 2.x")
Example 25
Project: OpenBottle   Author: xiaozhuchacha   File: toolbox.py    MIT License 5 votes vote down vote up
def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
        """
        Return an iterator that returns the next field in a ``(marker, value)``
        tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding``
        was specified in the ``fields()`` method. Otherwise they are non-unicode strings.

        :param strip: strip trailing whitespace from the last line of each field
        :type strip: bool
        :param unwrap: Convert newlines in a field to spaces.
        :type unwrap: bool
        :param encoding: Name of an encoding to use. If it is specified then
            the ``fields()`` method returns unicode strings rather than non
            unicode strings.
        :type encoding: str or None
        :param errors: Error handling scheme for codec. Same as the ``decode()``
            builtin string method.
        :type errors: str
        :param unicode_fields: Set of marker names whose values are UTF-8 encoded.
            Ignored if encoding is None. If the whole file is UTF-8 encoded set
            ``encoding='utf8'`` and leave ``unicode_fields`` with its default
            value of None.
        :type unicode_fields: sequence
        :rtype: iter(tuple(str, str))
        """
        # unicode_fields only makes sense when an encoding is also given.
        if encoding is None and unicode_fields is not None:
            raise ValueError('unicode_fields is set but not encoding.')
        # Collapses any run of newlines inside a field value into one space.
        unwrap_pat = re.compile(r'\n+')
        for mkr, val in self.raw_fields():
            if encoding and not PY3: # kludge - already decoded in PY3?
                # Python 2 only: the decode calls imply raw_fields() yields
                # byte strings here.  Markers listed in unicode_fields are
                # decoded as UTF-8 regardless of the file-level encoding.
                if unicode_fields is not None and mkr in unicode_fields:
                    val = val.decode('utf8', errors)
                else:
                    val = val.decode(encoding, errors)
                mkr = mkr.decode(encoding, errors)
            if unwrap:
                val = unwrap_pat.sub(' ', val)
            if strip:
                # Trailing whitespace is stripped after unwrapping so the
                # last (possibly joined) line is what gets trimmed.
                val = val.rstrip()
            yield (mkr, val)
Example 26
Project: OpenBottle   Author: xiaozhuchacha   File: crubadan.py    MIT License 5 votes vote down vote up
def _load_lang_mapping_data(self):
        """Populate ``self._lang_mapping_data`` from the corpus' mapper table.

        The mapper file is tab-separated; each row becomes a list of
        fields mapping a Crubadan code to its language description.

        :raises RuntimeError: if the corpus is not installed locally or
            the mapper file is missing.
        """
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        # Python 3 decodes at read time; Python 2 reads bytes and decodes
        # afterwards.  Context managers ensure the handle is closed (the
        # original leaked it).
        if PY3:
            with open(mapper_file, 'r', encoding='utf-8') as fh:
                raw = fh.read().strip()
        else:
            with open(mapper_file, 'rU') as fh:
                raw = fh.read().decode('utf-8').strip()

        self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
Example 27
Project: OpenBottle   Author: xiaozhuchacha   File: crubadan.py    MIT License 5 votes vote down vote up
def _load_lang_ngrams(self, lang):
        """Load the 3-gram file for ISO 639-3 code ``lang`` into a FreqDist.

        Each line of the data file holds ``"<freq> <ngram>"``.

        :param lang: ISO 639-3 language code
        :return: FreqDist mapping each trigram to its frequency
        :raises RuntimeError: if the language is unsupported or its
            n-gram file is missing.
        """
        if lang not in self.langs():
            raise RuntimeError("Unsupported language.")

        crubadan_code = self.iso_to_crubadan(lang)
        ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

        if not path.isfile(ngram_file):
            raise RuntimeError("No N-gram file found for requested language.")

        counts = FreqDist()
        # Python 3 decodes at read time; Python 2 decodes each line below.
        if PY3:
            f = open(ngram_file, 'r', encoding='utf-8')
        else:
            f = open(ngram_file, 'rU')

        try:
            for line in f:
                data = line.split(' ') if PY3 else line.decode('utf8').split(' ')
                ngram = data[1].strip('\n')
                freq = int(data[0])
                counts[ngram] = freq
        finally:
            # The original never closed the handle; always release it.
            f.close()

        return counts
Example 28
Project: OpenBottle   Author: xiaozhuchacha   File: test_2x_compat.py    MIT License 5 votes vote down vote up
def setup_module(module):
    """Skip this module under Python 3; it tests Python-2-only compat code."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
Example 29
Project: OpenBottle   Author: xiaozhuchacha   File: compat_fixt.py    MIT License 5 votes vote down vote up
def setup_module(module):
    """Skip this fixture under Python 3; compat.doctest targets Python 2."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("compat.doctest is for Python 2.x")
Example 30
Project: OpenBottle   Author: xiaozhuchacha   File: toolbox.py    MIT License 5 votes vote down vote up
def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
        """
        Return an iterator that returns the next field in a ``(marker, value)``
        tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding``
        was specified in the ``fields()`` method. Otherwise they are non-unicode strings.

        :param strip: strip trailing whitespace from the last line of each field
        :type strip: bool
        :param unwrap: Convert newlines in a field to spaces.
        :type unwrap: bool
        :param encoding: Name of an encoding to use. If it is specified then
            the ``fields()`` method returns unicode strings rather than non
            unicode strings.
        :type encoding: str or None
        :param errors: Error handling scheme for codec. Same as the ``decode()``
            builtin string method.
        :type errors: str
        :param unicode_fields: Set of marker names whose values are UTF-8 encoded.
            Ignored if encoding is None. If the whole file is UTF-8 encoded set
            ``encoding='utf8'`` and leave ``unicode_fields`` with its default
            value of None.
        :type unicode_fields: sequence
        :rtype: iter(tuple(str, str))
        """
        # unicode_fields only makes sense when an encoding is also given.
        if encoding is None and unicode_fields is not None:
            raise ValueError('unicode_fields is set but not encoding.')
        # Collapses any run of newlines inside a field value into one space.
        unwrap_pat = re.compile(r'\n+')
        for mkr, val in self.raw_fields():
            if encoding and not PY3: # kludge - already decoded in PY3?
                # Python 2 only: the decode calls imply raw_fields() yields
                # byte strings here.  Markers listed in unicode_fields are
                # decoded as UTF-8 regardless of the file-level encoding.
                if unicode_fields is not None and mkr in unicode_fields:
                    val = val.decode('utf8', errors)
                else:
                    val = val.decode(encoding, errors)
                mkr = mkr.decode(encoding, errors)
            if unwrap:
                val = unwrap_pat.sub(' ', val)
            if strip:
                # Trailing whitespace is stripped after unwrapping so the
                # last (possibly joined) line is what gets trimmed.
                val = val.rstrip()
            yield (mkr, val)
Example 31
Project: OpenBottle   Author: xiaozhuchacha   File: crubadan.py    MIT License 5 votes vote down vote up
def _load_lang_mapping_data(self):
        """Populate ``self._lang_mapping_data`` from the corpus' mapper table.

        The mapper file is tab-separated; each row becomes a list of
        fields mapping a Crubadan code to its language description.

        :raises RuntimeError: if the corpus is not installed locally or
            the mapper file is missing.
        """
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        # Python 3 decodes at read time; Python 2 reads bytes and decodes
        # afterwards.  Context managers ensure the handle is closed (the
        # original leaked it).
        if PY3:
            with open(mapper_file, 'r', encoding='utf-8') as fh:
                raw = fh.read().strip()
        else:
            with open(mapper_file, 'rU') as fh:
                raw = fh.read().decode('utf-8').strip()

        self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
Example 32
Project: OpenBottle   Author: xiaozhuchacha   File: portuguese_en_fixt.py    MIT License 5 votes vote down vote up
def setup_module(module):
    """Unconditionally skip the portuguese_en doctest.

    The doctest imports ``nltk.examples.pt``, which does not exist, so it
    can never run.  A second, Python-2-only skip guard that followed the
    unconditional raise was unreachable and has been removed.
    """
    from nose import SkipTest

    raise SkipTest("portuguese_en.doctest imports nltk.examples.pt which doesn't exist!")
Example 33
Project: OpenBottle   Author: xiaozhuchacha   File: test_2x_compat.py    MIT License 5 votes vote down vote up
def setup_module(module):
    """Skip this module under Python 3; it tests Python-2-only compat code."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
Example 34
Project: OpenBottle   Author: xiaozhuchacha   File: compat_fixt.py    MIT License 5 votes vote down vote up
def setup_module(module):
    """Skip this fixture under Python 3; compat.doctest targets Python 2."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("compat.doctest is for Python 2.x")
Example 35
Project: Health-Checker   Author: KriAga   File: toolbox.py    MIT License 5 votes vote down vote up
def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
        """
        Return an iterator that returns the next field in a ``(marker, value)``
        tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding``
        was specified in the ``fields()`` method. Otherwise they are non-unicode strings.

        :param strip: strip trailing whitespace from the last line of each field
        :type strip: bool
        :param unwrap: Convert newlines in a field to spaces.
        :type unwrap: bool
        :param encoding: Name of an encoding to use. If it is specified then
            the ``fields()`` method returns unicode strings rather than non
            unicode strings.
        :type encoding: str or None
        :param errors: Error handling scheme for codec. Same as the ``decode()``
            builtin string method.
        :type errors: str
        :param unicode_fields: Set of marker names whose values are UTF-8 encoded.
            Ignored if encoding is None. If the whole file is UTF-8 encoded set
            ``encoding='utf8'`` and leave ``unicode_fields`` with its default
            value of None.
        :type unicode_fields: sequence
        :rtype: iter(tuple(str, str))
        """
        # unicode_fields only makes sense when an encoding is also given.
        if encoding is None and unicode_fields is not None:
            raise ValueError('unicode_fields is set but not encoding.')
        # Collapses any run of newlines inside a field value into one space.
        unwrap_pat = re.compile(r'\n+')
        for mkr, val in self.raw_fields():
            if encoding and not PY3: # kludge - already decoded in PY3?
                # Python 2 only: the decode calls imply raw_fields() yields
                # byte strings here.  Markers listed in unicode_fields are
                # decoded as UTF-8 regardless of the file-level encoding.
                if unicode_fields is not None and mkr in unicode_fields:
                    val = val.decode('utf8', errors)
                else:
                    val = val.decode(encoding, errors)
                mkr = mkr.decode(encoding, errors)
            if unwrap:
                val = unwrap_pat.sub(' ', val)
            if strip:
                # Trailing whitespace is stripped after unwrapping so the
                # last (possibly joined) line is what gets trimmed.
                val = val.rstrip()
            yield (mkr, val)
Example 36
Project: Health-Checker   Author: KriAga   File: crubadan.py    MIT License 5 votes vote down vote up
def _load_lang_mapping_data(self):
        """Populate ``self._lang_mapping_data`` from the corpus' mapper table.

        The mapper file is tab-separated; each row becomes a list of
        fields mapping a Crubadan code to its language description.

        :raises RuntimeError: if the corpus is not installed locally or
            the mapper file is missing.
        """
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        # Python 3 decodes at read time; Python 2 reads bytes and decodes
        # afterwards.  Context managers ensure the handle is closed (the
        # original leaked it).
        if PY3:
            with open(mapper_file, 'r', encoding='utf-8') as fh:
                raw = fh.read().strip()
        else:
            with open(mapper_file, 'rU') as fh:
                raw = fh.read().decode('utf-8').strip()

        self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
Example 37
Project: Health-Checker   Author: KriAga   File: crubadan.py    MIT License 5 votes vote down vote up
def _load_lang_ngrams(self, lang):
    """Return a FreqDist of trigram frequencies for an ISO 639-3 code.

    :param lang: ISO 639-3 language code; must appear in ``self.langs()``.
    :return: ``FreqDist`` mapping each trigram to its integer frequency.
    :raises RuntimeError: if the language is unsupported or its
        ``<code>-3grams.txt`` file is missing.
    """
    if lang not in self.langs():
        raise RuntimeError("Unsupported language.")

    crubadan_code = self.iso_to_crubadan(lang)
    ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

    if not path.isfile(ngram_file):
        raise RuntimeError("No N-gram file found for requested language.")

    counts = FreqDist()
    if PY3:
        f = open(ngram_file, 'r', encoding='utf-8')
    else:
        f = open(ngram_file, 'rU')

    # Fix: close the handle deterministically instead of leaking it.
    with f:
        for line in f:
            # Python 3 text mode already yields str; decode only on 2.x.
            data = line.split(' ') if PY3 else line.decode('utf8').split(' ')
            ngram = data[1].strip('\n')
            freq = int(data[0])
            counts[ngram] = freq

    return counts
Example 38
Project: Health-Checker   Author: KriAga   File: test_2x_compat.py    MIT License 5 votes vote down vote up
def setup_module(module):
    """Skip this test module under Python 3; it exercises 2.x-only compat shims."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
Example 39
Project: Health-Checker   Author: KriAga   File: compat_fixt.py    MIT License 5 votes vote down vote up
def setup_module(module):
    """Skip the compat doctest fixture under Python 3; it targets 2.x only."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("compat.doctest is for Python 2.x")
Example 40
Project: FancyWord   Author: EastonLee   File: toolbox.py    GNU General Public License v3.0 5 votes vote down vote up
def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
    """Iterate over ``(marker, value)`` fields from the underlying reader.

    When ``encoding`` is given, markers and values are decoded to unicode
    on Python 2 (Python 3 readers already yield text); otherwise they are
    returned as read.

    :param strip: drop trailing whitespace from each field value
    :type strip: bool
    :param unwrap: replace runs of newlines inside a value with a space
    :type unwrap: bool
    :param encoding: codec used to decode markers and values, or None
    :type encoding: str or None
    :param errors: codec error scheme, as for ``str.decode()``
    :type errors: str
    :param unicode_fields: markers whose values are UTF-8 regardless of
        ``encoding``; must not be set when ``encoding`` is None
    :type unicode_fields: sequence
    :rtype: iter(tuple(str, str))
    :raises ValueError: if ``unicode_fields`` is given without ``encoding``
    """
    if encoding is None and unicode_fields is not None:
        raise ValueError('unicode_fields is set but not encoding.')
    newline_runs = re.compile(r'\n+')
    for marker, value in self.raw_fields():
        # kludge: on Python 3 the reader has already decoded the bytes,
        # so only decode when running under Python 2.
        if encoding and not PY3:
            if unicode_fields is not None and marker in unicode_fields:
                value = value.decode('utf8', errors)
            else:
                value = value.decode(encoding, errors)
            marker = marker.decode(encoding, errors)
        if unwrap:
            value = newline_runs.sub(' ', value)
        if strip:
            value = value.rstrip()
        yield (marker, value)
Example 41
Project: FancyWord   Author: EastonLee   File: crubadan.py    GNU General Public License v3.0 5 votes vote down vote up
def _load_lang_mapping_data(self):
    """Load language-code mappings from the corpus mapper table.

    Reads the tab-separated mapper file and stores one list of columns
    per row in ``self._lang_mapping_data`` (presumably Crubadan code,
    ISO code and description -- confirm against the corpus data).

    :raises RuntimeError: if the corpus is still zipped or the mapper
        file is missing from the corpus root.
    """
    if isinstance(self.root, ZipFilePathPointer):
        raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")

    mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
    if self._LANG_MAPPER_FILE not in self.fileids():
        raise RuntimeError("Could not find language mapper file: " + mapper_file)

    # Fix: close the file handle deterministically instead of leaking it.
    if PY3:
        with open(mapper_file, 'r', encoding='utf-8') as f:
            raw = f.read().strip()
    else:
        with open(mapper_file, 'rU') as f:
            raw = f.read().decode('utf-8').strip()

    self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
Example 42
Project: FancyWord   Author: EastonLee   File: crubadan.py    GNU General Public License v3.0 5 votes vote down vote up
def _load_lang_ngrams(self, lang):
    """Return a FreqDist of trigram frequencies for an ISO 639-3 code.

    :param lang: ISO 639-3 language code; must appear in ``self.langs()``.
    :return: ``FreqDist`` mapping each trigram to its integer frequency.
    :raises RuntimeError: if the language is unsupported or its
        ``<code>-3grams.txt`` file is missing.
    """
    if lang not in self.langs():
        raise RuntimeError("Unsupported language.")

    crubadan_code = self.iso_to_crubadan(lang)
    ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

    if not path.isfile(ngram_file):
        # Fix: the original raised the undefined name ``Runtime``, which
        # would surface as a NameError instead of this message.
        raise RuntimeError("No N-gram file found for requested language.")

    counts = FreqDist()
    if PY3:
        f = open(ngram_file, 'r', encoding='utf-8')
    else:
        f = open(ngram_file, 'rU')

    # Fix: close the handle deterministically instead of leaking it.
    with f:
        for line in f:
            # Python 3 text mode already yields str; decode only on 2.x.
            data = line.split(' ') if PY3 else line.decode('utf8').split(' ')
            ngram = data[1].strip('\n')
            freq = int(data[0])
            counts[ngram] = freq

    return counts
Example 43
Project: FancyWord   Author: EastonLee   File: portuguese_en_fixt.py    GNU General Public License v3.0 5 votes vote down vote up
def setup_module(module):
    """Unconditionally skip portuguese_en.doctest.

    The doctest imports ``nltk.examples.pt``, which doesn't exist, so we
    bail out before any Python-version check could run.  (The original
    retained an unreachable Python-2 skip after this raise; that dead
    code is removed here.)
    """
    from nose import SkipTest

    raise SkipTest("portuguese_en.doctest imports nltk.examples.pt which doesn't exist!")
Example 44
Project: FancyWord   Author: EastonLee   File: compat_fixt.py    GNU General Public License v3.0 5 votes vote down vote up
def setup_module(module):
    """Skip the compat doctest fixture under Python 3; it targets 2.x only."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("compat.doctest is for Python 2.x")
Example 45
Project: honours_project   Author: JFriel   File: toolbox.py    GNU General Public License v3.0 5 votes vote down vote up
def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
    """Iterate over ``(marker, value)`` fields from the underlying reader.

    When ``encoding`` is given, markers and values are decoded to unicode
    on Python 2 (Python 3 readers already yield text); otherwise they are
    returned as read.

    :param strip: drop trailing whitespace from each field value
    :type strip: bool
    :param unwrap: replace runs of newlines inside a value with a space
    :type unwrap: bool
    :param encoding: codec used to decode markers and values, or None
    :type encoding: str or None
    :param errors: codec error scheme, as for ``str.decode()``
    :type errors: str
    :param unicode_fields: markers whose values are UTF-8 regardless of
        ``encoding``; must not be set when ``encoding`` is None
    :type unicode_fields: sequence
    :rtype: iter(tuple(str, str))
    :raises ValueError: if ``unicode_fields`` is given without ``encoding``
    """
    if encoding is None and unicode_fields is not None:
        raise ValueError('unicode_fields is set but not encoding.')
    newline_runs = re.compile(r'\n+')
    for marker, value in self.raw_fields():
        # kludge: on Python 3 the reader has already decoded the bytes,
        # so only decode when running under Python 2.
        if encoding and not PY3:
            if unicode_fields is not None and marker in unicode_fields:
                value = value.decode('utf8', errors)
            else:
                value = value.decode(encoding, errors)
            marker = marker.decode(encoding, errors)
        if unwrap:
            value = newline_runs.sub(' ', value)
        if strip:
            value = value.rstrip()
        yield (marker, value)
Example 46
Project: honours_project   Author: JFriel   File: crubadan.py    GNU General Public License v3.0 5 votes vote down vote up
def _load_lang_mapping_data(self):
    """Load language-code mappings from the corpus mapper table.

    Reads the tab-separated mapper file and stores one list of columns
    per row in ``self._lang_mapping_data`` (presumably Crubadan code,
    ISO code and description -- confirm against the corpus data).

    :raises RuntimeError: if the corpus is still zipped or the mapper
        file is missing from the corpus root.
    """
    if isinstance(self.root, ZipFilePathPointer):
        raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")

    mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
    if self._LANG_MAPPER_FILE not in self.fileids():
        raise RuntimeError("Could not find language mapper file: " + mapper_file)

    # Fix: close the file handle deterministically instead of leaking it.
    if PY3:
        with open(mapper_file, 'r', encoding='utf-8') as f:
            raw = f.read().strip()
    else:
        with open(mapper_file, 'rU') as f:
            raw = f.read().decode('utf-8').strip()

    self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
Example 47
Project: honours_project   Author: JFriel   File: crubadan.py    GNU General Public License v3.0 5 votes vote down vote up
def _load_lang_ngrams(self, lang):
    """Return a FreqDist of trigram frequencies for an ISO 639-3 code.

    :param lang: ISO 639-3 language code; must appear in ``self.langs()``.
    :return: ``FreqDist`` mapping each trigram to its integer frequency.
    :raises RuntimeError: if the language is unsupported or its
        ``<code>-3grams.txt`` file is missing.
    """
    if lang not in self.langs():
        raise RuntimeError("Unsupported language.")

    crubadan_code = self.iso_to_crubadan(lang)
    ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

    if not path.isfile(ngram_file):
        # Fix: the original raised the undefined name ``Runtime``, which
        # would surface as a NameError instead of this message.
        raise RuntimeError("No N-gram file found for requested language.")

    counts = FreqDist()
    if PY3:
        f = open(ngram_file, 'r', encoding='utf-8')
    else:
        f = open(ngram_file, 'rU')

    # Fix: close the handle deterministically instead of leaking it.
    with f:
        for line in f:
            # Python 3 text mode already yields str; decode only on 2.x.
            data = line.split(' ') if PY3 else line.decode('utf8').split(' ')
            ngram = data[1].strip('\n')
            freq = int(data[0])
            counts[ngram] = freq

    return counts
Example 48
Project: honours_project   Author: JFriel   File: test_2x_compat.py    GNU General Public License v3.0 5 votes vote down vote up
def setup_module(module):
    """Skip this test module under Python 3; it exercises 2.x-only compat shims."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
Example 49
Project: honours_project   Author: JFriel   File: compat_fixt.py    GNU General Public License v3.0 5 votes vote down vote up
def setup_module(module):
    """Skip the compat doctest fixture under Python 3; it targets 2.x only."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("compat.doctest is for Python 2.x")
Example 50
Project: honours_project   Author: JFriel   File: toolbox.py    GNU General Public License v3.0 5 votes vote down vote up
def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
    """Iterate over ``(marker, value)`` fields from the underlying reader.

    When ``encoding`` is given, markers and values are decoded to unicode
    on Python 2 (Python 3 readers already yield text); otherwise they are
    returned as read.

    :param strip: drop trailing whitespace from each field value
    :type strip: bool
    :param unwrap: replace runs of newlines inside a value with a space
    :type unwrap: bool
    :param encoding: codec used to decode markers and values, or None
    :type encoding: str or None
    :param errors: codec error scheme, as for ``str.decode()``
    :type errors: str
    :param unicode_fields: markers whose values are UTF-8 regardless of
        ``encoding``; must not be set when ``encoding`` is None
    :type unicode_fields: sequence
    :rtype: iter(tuple(str, str))
    :raises ValueError: if ``unicode_fields`` is given without ``encoding``
    """
    if encoding is None and unicode_fields is not None:
        raise ValueError('unicode_fields is set but not encoding.')
    newline_runs = re.compile(r'\n+')
    for marker, value in self.raw_fields():
        # kludge: on Python 3 the reader has already decoded the bytes,
        # so only decode when running under Python 2.
        if encoding and not PY3:
            if unicode_fields is not None and marker in unicode_fields:
                value = value.decode('utf8', errors)
            else:
                value = value.decode(encoding, errors)
            marker = marker.decode(encoding, errors)
        if unwrap:
            value = newline_runs.sub(' ', value)
        if strip:
            value = value.rstrip()
        yield (marker, value)
Example 51
Project: honours_project   Author: JFriel   File: crubadan.py    GNU General Public License v3.0 5 votes vote down vote up
def _load_lang_mapping_data(self):
    """Load language-code mappings from the corpus mapper table.

    Reads the tab-separated mapper file and stores one list of columns
    per row in ``self._lang_mapping_data`` (presumably Crubadan code,
    ISO code and description -- confirm against the corpus data).

    :raises RuntimeError: if the corpus is still zipped or the mapper
        file is missing from the corpus root.
    """
    if isinstance(self.root, ZipFilePathPointer):
        raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")

    mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
    if self._LANG_MAPPER_FILE not in self.fileids():
        raise RuntimeError("Could not find language mapper file: " + mapper_file)

    # Fix: close the file handle deterministically instead of leaking it.
    if PY3:
        with open(mapper_file, 'r', encoding='utf-8') as f:
            raw = f.read().strip()
    else:
        with open(mapper_file, 'rU') as f:
            raw = f.read().decode('utf-8').strip()

    self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
Example 52
Project: honours_project   Author: JFriel   File: portuguese_en_fixt.py    GNU General Public License v3.0 5 votes vote down vote up
def setup_module(module):
    """Unconditionally skip portuguese_en.doctest.

    The doctest imports ``nltk.examples.pt``, which doesn't exist, so we
    bail out before any Python-version check could run.  (The original
    retained an unreachable Python-2 skip after this raise; that dead
    code is removed here.)
    """
    from nose import SkipTest

    raise SkipTest("portuguese_en.doctest imports nltk.examples.pt which doesn't exist!")
Example 53
Project: honours_project   Author: JFriel   File: test_2x_compat.py    GNU General Public License v3.0 5 votes vote down vote up
def setup_module(module):
    """Skip this test module under Python 3; it exercises 2.x-only compat shims."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
Example 54
Project: honours_project   Author: JFriel   File: compat_fixt.py    GNU General Public License v3.0 5 votes vote down vote up
def setup_module(module):
    """Skip the compat doctest fixture under Python 3; it targets 2.x only."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("compat.doctest is for Python 2.x")
Example 55
Project: aop-helpFinder   Author: jecarvaill   File: toolbox.py    GNU General Public License v3.0 5 votes vote down vote up
def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
    """Iterate over ``(marker, value)`` fields from the underlying reader.

    When ``encoding`` is given, markers and values are decoded to unicode
    on Python 2 (Python 3 readers already yield text); otherwise they are
    returned as read.

    :param strip: drop trailing whitespace from each field value
    :type strip: bool
    :param unwrap: replace runs of newlines inside a value with a space
    :type unwrap: bool
    :param encoding: codec used to decode markers and values, or None
    :type encoding: str or None
    :param errors: codec error scheme, as for ``str.decode()``
    :type errors: str
    :param unicode_fields: markers whose values are UTF-8 regardless of
        ``encoding``; must not be set when ``encoding`` is None
    :type unicode_fields: sequence
    :rtype: iter(tuple(str, str))
    :raises ValueError: if ``unicode_fields`` is given without ``encoding``
    """
    if encoding is None and unicode_fields is not None:
        raise ValueError('unicode_fields is set but not encoding.')
    newline_runs = re.compile(r'\n+')
    for marker, value in self.raw_fields():
        # kludge: on Python 3 the reader has already decoded the bytes,
        # so only decode when running under Python 2.
        if encoding and not PY3:
            if unicode_fields is not None and marker in unicode_fields:
                value = value.decode('utf8', errors)
            else:
                value = value.decode(encoding, errors)
            marker = marker.decode(encoding, errors)
        if unwrap:
            value = newline_runs.sub(' ', value)
        if strip:
            value = value.rstrip()
        yield (marker, value)
Example 56
Project: aop-helpFinder   Author: jecarvaill   File: crubadan.py    GNU General Public License v3.0 5 votes vote down vote up
def _load_lang_mapping_data(self):
    """Load language-code mappings from the corpus mapper table.

    Reads the tab-separated mapper file and stores one list of columns
    per row in ``self._lang_mapping_data`` (presumably Crubadan code,
    ISO code and description -- confirm against the corpus data).

    :raises RuntimeError: if the corpus is still zipped or the mapper
        file is missing from the corpus root.
    """
    if isinstance(self.root, ZipFilePathPointer):
        raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")

    mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
    if self._LANG_MAPPER_FILE not in self.fileids():
        raise RuntimeError("Could not find language mapper file: " + mapper_file)

    # Fix: close the file handle deterministically instead of leaking it.
    if PY3:
        with open(mapper_file, 'r', encoding='utf-8') as f:
            raw = f.read().strip()
    else:
        with open(mapper_file, 'rU') as f:
            raw = f.read().decode('utf-8').strip()

    self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
Example 57
Project: aop-helpFinder   Author: jecarvaill   File: crubadan.py    GNU General Public License v3.0 5 votes vote down vote up
def _load_lang_ngrams(self, lang):
    """Return a FreqDist of trigram frequencies for an ISO 639-3 code.

    :param lang: ISO 639-3 language code; must appear in ``self.langs()``.
    :return: ``FreqDist`` mapping each trigram to its integer frequency.
    :raises RuntimeError: if the language is unsupported or its
        ``<code>-3grams.txt`` file is missing.
    """
    if lang not in self.langs():
        raise RuntimeError("Unsupported language.")

    crubadan_code = self.iso_to_crubadan(lang)
    ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

    if not path.isfile(ngram_file):
        raise RuntimeError("No N-gram file found for requested language.")

    counts = FreqDist()
    if PY3:
        f = open(ngram_file, 'r', encoding='utf-8')
    else:
        f = open(ngram_file, 'rU')

    # Fix: close the handle deterministically instead of leaking it.
    with f:
        for line in f:
            # Python 3 text mode already yields str; decode only on 2.x.
            data = line.split(' ') if PY3 else line.decode('utf8').split(' ')
            ngram = data[1].strip('\n')
            freq = int(data[0])
            counts[ngram] = freq

    return counts
Example 58
Project: aop-helpFinder   Author: jecarvaill   File: test_2x_compat.py    GNU General Public License v3.0 5 votes vote down vote up
def setup_module(module):
    """Skip this test module under Python 3; it exercises 2.x-only compat shims."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
Example 59
Project: aop-helpFinder   Author: jecarvaill   File: compat_fixt.py    GNU General Public License v3.0 5 votes vote down vote up
def setup_module(module):
    """Skip the compat doctest fixture under Python 3; it targets 2.x only."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("compat.doctest is for Python 2.x")
Example 60
Project: serverless-chatbots-workshop   Author: datteswararao   File: toolbox.py    Apache License 2.0 5 votes vote down vote up
def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
    """Iterate over ``(marker, value)`` fields from the underlying reader.

    When ``encoding`` is given, markers and values are decoded to unicode
    on Python 2 (Python 3 readers already yield text); otherwise they are
    returned as read.

    :param strip: drop trailing whitespace from each field value
    :type strip: bool
    :param unwrap: replace runs of newlines inside a value with a space
    :type unwrap: bool
    :param encoding: codec used to decode markers and values, or None
    :type encoding: str or None
    :param errors: codec error scheme, as for ``str.decode()``
    :type errors: str
    :param unicode_fields: markers whose values are UTF-8 regardless of
        ``encoding``; must not be set when ``encoding`` is None
    :type unicode_fields: sequence
    :rtype: iter(tuple(str, str))
    :raises ValueError: if ``unicode_fields`` is given without ``encoding``
    """
    if encoding is None and unicode_fields is not None:
        raise ValueError('unicode_fields is set but not encoding.')
    newline_runs = re.compile(r'\n+')
    for marker, value in self.raw_fields():
        # kludge: on Python 3 the reader has already decoded the bytes,
        # so only decode when running under Python 2.
        if encoding and not PY3:
            if unicode_fields is not None and marker in unicode_fields:
                value = value.decode('utf8', errors)
            else:
                value = value.decode(encoding, errors)
            marker = marker.decode(encoding, errors)
        if unwrap:
            value = newline_runs.sub(' ', value)
        if strip:
            value = value.rstrip()
        yield (marker, value)
Example 61
Project: serverless-chatbots-workshop   Author: datteswararao   File: crubadan.py    Apache License 2.0 5 votes vote down vote up
def _load_lang_mapping_data(self):
    """Load language-code mappings from the corpus mapper table.

    Reads the tab-separated mapper file and stores one list of columns
    per row in ``self._lang_mapping_data`` (presumably Crubadan code,
    ISO code and description -- confirm against the corpus data).

    :raises RuntimeError: if the corpus is still zipped or the mapper
        file is missing from the corpus root.
    """
    if isinstance(self.root, ZipFilePathPointer):
        raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")

    mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
    if self._LANG_MAPPER_FILE not in self.fileids():
        raise RuntimeError("Could not find language mapper file: " + mapper_file)

    # Fix: close the file handle deterministically instead of leaking it.
    if PY3:
        with open(mapper_file, 'r', encoding='utf-8') as f:
            raw = f.read().strip()
    else:
        with open(mapper_file, 'rU') as f:
            raw = f.read().decode('utf-8').strip()

    self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
Example 62
Project: serverless-chatbots-workshop   Author: datteswararao   File: crubadan.py    Apache License 2.0 5 votes vote down vote up
def _load_lang_ngrams(self, lang):
    """Return a FreqDist of trigram frequencies for an ISO 639-3 code.

    :param lang: ISO 639-3 language code; must appear in ``self.langs()``.
    :return: ``FreqDist`` mapping each trigram to its integer frequency.
    :raises RuntimeError: if the language is unsupported or its
        ``<code>-3grams.txt`` file is missing.
    """
    if lang not in self.langs():
        raise RuntimeError("Unsupported language.")

    crubadan_code = self.iso_to_crubadan(lang)
    ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

    if not path.isfile(ngram_file):
        # Fix: the original raised the undefined name ``Runtime``, which
        # would surface as a NameError instead of this message.
        raise RuntimeError("No N-gram file found for requested language.")

    counts = FreqDist()
    if PY3:
        f = open(ngram_file, 'r', encoding='utf-8')
    else:
        f = open(ngram_file, 'rU')

    # Fix: close the handle deterministically instead of leaking it.
    with f:
        for line in f:
            # Python 3 text mode already yields str; decode only on 2.x.
            data = line.split(' ') if PY3 else line.decode('utf8').split(' ')
            ngram = data[1].strip('\n')
            freq = int(data[0])
            counts[ngram] = freq

    return counts
Example 63
Project: serverless-chatbots-workshop   Author: datteswararao   File: test_2x_compat.py    Apache License 2.0 5 votes vote down vote up
def setup_module(module):
    """Skip this test module under Python 3; it exercises 2.x-only compat shims."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
Example 64
Project: serverless-chatbots-workshop   Author: datteswararao   File: compat_fixt.py    Apache License 2.0 5 votes vote down vote up
def setup_module(module):
    """Skip the compat doctest fixture under Python 3; it targets 2.x only."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("compat.doctest is for Python 2.x")
Example 65
Project: serverless-chatbots-workshop   Author: datteswararao   File: toolbox.py    Apache License 2.0 5 votes vote down vote up
def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
    """Iterate over ``(marker, value)`` fields from the underlying reader.

    When ``encoding`` is given, markers and values are decoded to unicode
    on Python 2 (Python 3 readers already yield text); otherwise they are
    returned as read.

    :param strip: drop trailing whitespace from each field value
    :type strip: bool
    :param unwrap: replace runs of newlines inside a value with a space
    :type unwrap: bool
    :param encoding: codec used to decode markers and values, or None
    :type encoding: str or None
    :param errors: codec error scheme, as for ``str.decode()``
    :type errors: str
    :param unicode_fields: markers whose values are UTF-8 regardless of
        ``encoding``; must not be set when ``encoding`` is None
    :type unicode_fields: sequence
    :rtype: iter(tuple(str, str))
    :raises ValueError: if ``unicode_fields`` is given without ``encoding``
    """
    if encoding is None and unicode_fields is not None:
        raise ValueError('unicode_fields is set but not encoding.')
    newline_runs = re.compile(r'\n+')
    for marker, value in self.raw_fields():
        # kludge: on Python 3 the reader has already decoded the bytes,
        # so only decode when running under Python 2.
        if encoding and not PY3:
            if unicode_fields is not None and marker in unicode_fields:
                value = value.decode('utf8', errors)
            else:
                value = value.decode(encoding, errors)
            marker = marker.decode(encoding, errors)
        if unwrap:
            value = newline_runs.sub(' ', value)
        if strip:
            value = value.rstrip()
        yield (marker, value)
Example 66
Project: serverless-chatbots-workshop   Author: datteswararao   File: crubadan.py    Apache License 2.0 5 votes vote down vote up
def _load_lang_mapping_data(self):
    """Load language-code mappings from the corpus mapper table.

    Reads the tab-separated mapper file and stores one list of columns
    per row in ``self._lang_mapping_data`` (presumably Crubadan code,
    ISO code and description -- confirm against the corpus data).

    :raises RuntimeError: if the corpus is still zipped or the mapper
        file is missing from the corpus root.
    """
    if isinstance(self.root, ZipFilePathPointer):
        raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")

    mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
    if self._LANG_MAPPER_FILE not in self.fileids():
        raise RuntimeError("Could not find language mapper file: " + mapper_file)

    # Fix: close the file handle deterministically instead of leaking it.
    if PY3:
        with open(mapper_file, 'r', encoding='utf-8') as f:
            raw = f.read().strip()
    else:
        with open(mapper_file, 'rU') as f:
            raw = f.read().decode('utf-8').strip()

    self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
Example 67
Project: serverless-chatbots-workshop   Author: datteswararao   File: portuguese_en_fixt.py    Apache License 2.0 5 votes vote down vote up
def setup_module(module):
    """Unconditionally skip portuguese_en.doctest.

    The doctest imports ``nltk.examples.pt``, which doesn't exist, so we
    bail out before any Python-version check could run.  (The original
    retained an unreachable Python-2 skip after this raise; that dead
    code is removed here.)
    """
    from nose import SkipTest

    raise SkipTest("portuguese_en.doctest imports nltk.examples.pt which doesn't exist!")
Example 68
Project: serverless-chatbots-workshop   Author: datteswararao   File: test_2x_compat.py    Apache License 2.0 5 votes vote down vote up
def setup_module(module):
    """Skip this test module under Python 3; it exercises 2.x-only compat shims."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
Example 69
Project: serverless-chatbots-workshop   Author: datteswararao   File: compat_fixt.py    Apache License 2.0 5 votes vote down vote up
def setup_module(module):
    """Skip the compat doctest fixture under Python 3; it targets 2.x only."""
    from nose import SkipTest
    if not PY3:
        return
    raise SkipTest("compat.doctest is for Python 2.x")