Python gensim.utils.to_unicode() Examples

The following are code examples of gensim.utils.to_unicode(), drawn from open-source projects. You can go to the original project or source file by following the link above each example. You may also want to check out all available functions and classes of the gensim.utils module.
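As a refresher, to_unicode() (an alias of gensim.utils.any2unicode) decodes a byte string into unicode, using UTF-8 by default, and returns already-decoded strings unchanged. A minimal usage sketch, assuming gensim is installed:

from gensim import utils

print(utils.to_unicode(b'caf\xc3\xa9'))                     # UTF-8 bytes -> u'café'
print(utils.to_unicode(u'café'))                            # unicode input is returned unchanged
print(utils.to_unicode(b'caf\xe9', encoding='latin1'))      # explicit encoding for non-UTF-8 bytes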
Example #1
Source File: ldamallet.py    From topical_word_embeddings with MIT License
def load_word_topics(self):
        logger.info("loading assigned topics from %s" % self.fstate())
        wordtopics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
        with utils.smart_open(self.fstate()) as fin:
            _ = next(fin)  # header
            self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
            assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
            _ = next(fin)  # beta
            for lineno, line in enumerate(fin):
                line = utils.to_unicode(line)
                doc, source, pos, typeindex, token, topic = line.split()
                tokenid = self.id2word.token2id[token] if hasattr(self.id2word, 'token2id') else int(token)
                wordtopics[int(topic), tokenid] += 1
        logger.info("loaded assigned topics for %i tokens" % wordtopics.sum())
        self.wordtopics = wordtopics
        self.print_topics(15) 
Example #2
Source File: dictionary.py    From topical_word_embeddings with MIT License
def load_from_text(fname):
        """
        Load a previously stored Dictionary from a text file.
        Mirror function to `save_as_text`.
        """
        result = Dictionary()
        with utils.smart_open(fname) as f:
            for lineno, line in enumerate(f):
                line = utils.to_unicode(line)
                try:
                    wordid, word, docfreq = line[:-1].split('\t')
                except Exception:
                    raise ValueError("invalid line in dictionary file %s: %s"
                                     % (fname, line.strip()))
                wordid = int(wordid)
                if word in result.token2id:
                    raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
                result.token2id[word] = wordid
                result.dfs[wordid] = int(docfreq)
        return result 
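The format read here is the one written by Dictionary.save_as_text(): one tab-separated id/word/document-frequency triple per line (newer gensim releases may also prepend a document-count header line). A small round-trip sketch with an illustrative file path:

from gensim.corpora import Dictionary

dictionary = Dictionary([['human', 'computer'], ['computer', 'interface']])
dictionary.save_as_text('/tmp/vocab.txt')              # writes "id<TAB>word<TAB>docfreq" lines
restored = Dictionary.load_from_text('/tmp/vocab.txt')
assert restored.token2id == dictionary.token2id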
Example #3
Source File: test_lee.py    From topical_word_embeddings with MIT License
def setUp(self):
        """setup lee test corpora"""
        global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2

        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        bg_corpus_file = 'lee_background.cor'
        corpus_file = 'lee.cor'
        sim_file = 'similarities0-1.txt'

        # read in the corpora
        latin1 = lambda line: utils.to_unicode(line, encoding='latin1')
        with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus = preprocess_documents(latin1(line) for line in f)
        with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
            corpus = preprocess_documents(latin1(line) for line in f)
        with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]
        with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
            corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]

        # read the human similarity data
        sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
        sim_m_size = np.shape(sim_matrix)[0]
        human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)] 
Example #4
Source File: dump.py    From embedding with MIT License
def tokenize(content, token_min_len=2, token_max_len=100, lower=True):
    content = re.sub(EMAIL_PATTERN, ' ', content)  # remove email pattern
    content = re.sub(URL_PATTERN, ' ', content) # remove url pattern
    content = re.sub(WIKI_REMOVE_CHARS, ' ', content)  # remove unnecessary chars
    content = re.sub(WIKI_SPACE_CHARS, ' ', content)
    content = re.sub(MULTIPLE_SPACES, ' ', content)
    tokens = content.replace(", )", "").split(" ")
    result = []
    for token in tokens:
        if not token.startswith('_'):
            token_candidate = to_unicode(re.sub(WIKI_REMOVE_TOKEN_CHARS, '', token))
        else:
            token_candidate = ""
        if len(token_candidate) > 0:
            result.append(token_candidate)
    return result 
Example #5
Source File: test_word2vec.py    From topical_word_embeddings with MIT License
def testLineSentenceWorksWithCompressedFile(self):
        """Does LineSentence work with a compressed file object argument?"""
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split()) 
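These tests exercise gensim's word2vec.LineSentence, which accepts a filename, an open file object, or a compressed file object, and yields each line of the underlying file as a list of unicode tokens. A minimal sketch with an illustrative corpus path:

from gensim.models import word2vec

# hypothetical file containing one whitespace-separated sentence per line
for sentence in word2vec.LineSentence('/tmp/corpus.txt'):
    print(sentence)   # each sentence is a list of unicode tokens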
Example #6
Source File: preprocessing.py    From topical_word_embeddings with MIT License
def strip_multiple_whitespaces(s):
    s = utils.to_unicode(s)
    return RE_WHITESPACE.sub(" ", s) 
Example #7
Source File: matutils.py    From topical_word_embeddings with MIT License
def __init__(self, input, transposed=True):
        """
        Initialize the matrix reader.

        The `input` refers to a file on local filesystem, which is expected to
        be in the sparse (coordinate) Matrix Market format. Documents are assumed
        to be rows of the matrix (and document features are columns).

        `input` is either a string (file path) or a file-like object that supports
        `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).
        """
        logger.info("initializing corpus reader from %s" % input)
        self.input, self.transposed = input, transposed
        with utils.file_or_filename(self.input) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
                if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                    raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                                    (self.input, header))
            except StopIteration:
                pass

            self.num_docs = self.num_terms = self.num_nnz = 0
            for lineno, line in enumerate(lines):
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info("accepted corpus with %i documents, %i features, %i non-zero entries" %
                     (self.num_docs, self.num_terms, self.num_nnz)) 
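The header being checked belongs to the sparse (coordinate) Matrix Market format, which gensim itself produces via MmCorpus.serialize(). A small round-trip sketch with an illustrative path:

from gensim.corpora import MmCorpus

bow_corpus = [[(0, 1.0), (2, 2.0)], [(1, 3.0), (2, 1.0)]]   # two documents as (feature_id, weight) pairs
MmCorpus.serialize('/tmp/corpus.mm', bow_corpus)            # writes the '%%MatrixMarket matrix coordinate real general' header
mm = MmCorpus('/tmp/corpus.mm')
print(mm.num_docs, mm.num_terms, mm.num_nnz)                # 2 3 4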
Example #8
Source File: svmlightcorpus.py    From topical_word_embeddings with MIT License
def line2doc(self, line):
        """
        Create a document from a single line (string) in SVMlight format
        """
        line = utils.to_unicode(line)
        line = line[: line.find('#')].strip()
        if not line:
            return None # ignore comments and empty lines
        parts = line.split()
        if not parts:
            raise ValueError('invalid line format in %s' % self.fname)
        target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
        doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based
        return doc, target 
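An SVMlight line has the form "target featureid:value featureid:value ... # comment", with 1-based feature ids. A minimal round-trip sketch using gensim's SvmLightCorpus (path is illustrative):

from gensim.corpora import SvmLightCorpus

SvmLightCorpus.serialize('/tmp/corpus.svmlight', [[(0, 2.0), (3, 1.0)]])   # 0-based ids are written 1-based
corpus = SvmLightCorpus('/tmp/corpus.svmlight')
print(list(corpus))   # [[(0, 2.0), (3, 1.0)]]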
Example #9
Source File: wikicorpus.py    From topical_word_embeddings with MIT License
def filter_wiki(raw):
    """
    Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode
    or utf-8 encoded string.
    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text) # '&nbsp;' --> '\xa0'
    return remove_markup(text) 
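A minimal usage sketch; the markup string is illustrative and, since the parsing is heuristic, the output is approximate:

from gensim.corpora.wikicorpus import filter_wiki

raw = "A [[political philosophy]] article about [[self-governance|self-governed]] societies.<ref>citation</ref>"
print(filter_wiki(raw))
# prints roughly: A political philosophy article about self-governed societies.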
Example #10
Source File: preprocessing.py    From topical_word_embeddings with MIT License
def strip_short(s, minsize=3):
    s = utils.to_unicode(s)
    return " ".join(e for e in s.split() if len(e) >= minsize) 
Example #11
Source File: preprocessing.py    From topical_word_embeddings with MIT License
def remove_stopwords(s):
    s = utils.to_unicode(s)
    return " ".join(w for w in s.split() if w not in STOPWORDS) 
Example #12
Source File: preprocessing.py    From topical_word_embeddings with MIT License
def strip_punctuation(s):
    s = utils.to_unicode(s)
    return RE_PUNCT.sub(" ", s)


# unicode.translate cannot delete characters like str can 
Example #13
Source File: preprocessing.py    From topical_word_embeddings with MIT License
def strip_tags(s):
    s = utils.to_unicode(s)
    return RE_TAGS.sub("", s) 
Example #14
Source File: preprocessing.py    From topical_word_embeddings with MIT License
def strip_numeric(s):
    s = utils.to_unicode(s)
    return RE_NUMERIC.sub("", s) 
Example #15
Source File: test_word2vec.py    From topical_word_embeddings with MIT License
def testLineSentenceWorksWithNormalFile(self):
        """Does LineSentence work with a file object argument, rather than filename?"""
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
                sentences = word2vec.LineSentence(fin)
                for words in sentences:
                    self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators 
Example #16
Source File: test_word2vec.py    From topical_word_embeddings with MIT License
def testLineSentenceWorksWithFilename(self):
        """Does LineSentence work with a filename argument?"""
        with utils.smart_open(datapath('lee_background.cor')) as orig:
            sentences = word2vec.LineSentence(datapath('lee_background.cor'))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split()) 
Example #17
Source File: preprocessing.py    From topical_word_embeddings with MIT License
def preprocess_string(s, filters=DEFAULT_FILTERS):
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s.split() 
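With the default filter chain (lowercasing, tag/punctuation/whitespace/numeric stripping, stopword and short-token removal, stemming), a call looks roughly like this:

from gensim.parsing.preprocessing import preprocess_string

print(preprocess_string("<i>Hello</i> World 9! Processing text can be fun."))
# -> ['hello', 'world', 'process', 'text', 'fun']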
Example #18
Source File: preprocessing.py    From topical_word_embeddings with MIT License
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split()) 
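A quick sketch of the stemmer on its own:

from gensim.parsing.preprocessing import stem_text

print(stem_text("Processing Continues"))   # -> 'process continu'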
Example #19
Source File: preprocessing.py    From topical_word_embeddings with MIT License
def split_alphanum(s):
    s = utils.to_unicode(s)
    s = RE_AL_NUM.sub(r"\1 \2", s)
    return RE_NUM_AL.sub(r"\1 \2", s) 
Example #20
Source File: preprocessing.py    From topical_word_embeddings with MIT License
def strip_non_alphanum(s):
    s = utils.to_unicode(s)
    return RE_NONALPHA.sub(" ", s) 
Example #21
Source File: malletcorpus.py    From topical_word_embeddings with MIT License
def line2doc(self, line):
        l = [word for word in utils.to_unicode(line).strip().split(' ') if word]
        docid, doclang, words = l[0], l[1], l[2:]

        doc = super(MalletCorpus, self).line2doc(' '.join(words))

        if self.metadata:
            return doc, (docid, doclang)
        else:
            return doc
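Each MALLET-format line is "docid language token token ...": line2doc strips the document id and language and hands the remaining tokens to the parent LowCorpus parser. A minimal round-trip sketch with illustrative path and contents:

from gensim.corpora.malletcorpus import MalletCorpus

with open('/tmp/corpus.mallet', 'w') as f:
    f.write("doc1 en human computer interface\n")
    f.write("doc2 en graph of trees\n")

corpus = MalletCorpus('/tmp/corpus.mallet')
for doc in corpus:
    print(doc)   # bag-of-words: a list of (word_id, count) tuples per document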