import collections
import re

import numpy as np
import pandas as pd

from scattertext.CSRMatrixTools import delete_columns, CSRMatrixFactory
from scattertext.FeatureOuput import FeatureLister
from scattertext.Common import SPACY_ENTITY_TAGS, MY_ENGLISH_STOP_WORDS, DEFAULT_BACKGROUND_SCALER_ALGO, \
    DEFAULT_BACKGROUND_BETA
from scattertext.frequencyreaders.DefaultBackgroundFrequencies import DefaultBackgroundFrequencies
from scattertext.termranking import AbsoluteFrequencyRanker
from scattertext.termscoring import ScaledFScore
from scattertext.indexstore.IndexStore import IndexStore


class TermDocMatrixWithoutCategories(object):
    def __init__(self, X, mX, term_idx_store, metadata_idx_store, unigram_frequency_path=None):
        '''
        Parameters
        ----------
        X : csr_matrix
            term-document matrix
        mX : csr_matrix
            metadata-document matrix
        term_idx_store : IndexStore
            Term indices
        metadata_idx_store : IndexStore
            Document metadata indices
        unigram_frequency_path : str or None
            Path to term frequency file.
        '''
        self._X = X
        self._mX = mX
        self._term_idx_store = term_idx_store
        self._metadata_idx_store = metadata_idx_store
        self._unigram_frequency_path = unigram_frequency_path
        self._background_corpus = None
        self._strict_unigram_definition = True

    def get_default_stoplist(self):
        return MY_ENGLISH_STOP_WORDS

    def allow_single_quotes_in_unigrams(self):
        '''
        Don't filter out single quotes in unigrams.

        :return: self
        '''
        self._strict_unigram_definition = False
        return self

    def compact(self, compactor, non_text=False):
        '''
        Compact the term-document matrix.

        Parameters
        ----------
        compactor : object
            Object that takes a term-document matrix as its first argument and has a
            compact function which returns a term-document-matrix-like object.
        non_text : bool
            Use non-text features. False by default.

        Returns
        -------
        TermDocMatrix
        '''
        return compactor.compact(self, non_text)

    def select(self, compactor, non_text=False):
        '''
        Same as compact.
        '''
        return compactor.compact(self, non_text)

    def get_num_terms(self):
        '''
        Returns
        -------
        int, the number of terms registered in the term-document matrix
        '''
        return len(self._term_idx_store)

    def get_num_docs(self):
        '''
        Returns
        -------
        int, number of documents
        '''
        return self._X.shape[0]

    def get_num_metadata(self):
        '''
        Returns
        -------
        int, number of unique metadata items
        '''
        return len(self.get_metadata())

    def set_background_corpus(self, background):
        '''
        Parameters
        ----------
        background : TermDocMatrixWithoutCategories or pd.DataFrame
            Either a corpus-like object, or a DataFrame with columns "word" and
            "background", where "word" is the term text and "background" is its frequency.
        '''
        if issubclass(type(background), TermDocMatrixWithoutCategories):
            self._background_corpus = pd.DataFrame(background
                                                   .get_term_freq_df()
                                                   .sum(axis=1),
                                                   columns=['background']).reset_index()
            self._background_corpus.columns = ['word', 'background']
        elif (type(background) == pd.DataFrame
              and set(background.columns) == set(['word', 'background'])):
            self._background_corpus = background
        else:
            raise Exception('The argument named background must be a subclass of TermDocMatrix or a '
                            + 'DataFrame with columns "word" and "background", where "word" '
                            + 'is the term text, and "background" is its frequency.')

    def get_background_corpus(self):
        if self._background_corpus is not None:
            return self._background_corpus
        return DefaultBackgroundFrequencies.get_background_frequency_df(self._unigram_frequency_path)

    def get_term_and_background_counts(self):
        '''
        Returns
        -------
        A pd.DataFrame consisting of unigram term counts of words occurring
        in the TermDocumentMatrix and their corresponding background corpus
        counts.  The dataframe has two columns, corpus and background.

        >>> corpus.get_unigram_corpus().get_term_and_background_counts()
                       corpus  background
        obama           702.0    565739.0
        romney          570.0    695398.0
        barack          248.0    227861.0
        ...
        '''
        background_df = self._get_background_unigram_frequencies()
        corpus_freq_df = self.get_term_count_df()
        corpus_unigram_freq = self._get_corpus_unigram_freq(corpus_freq_df)
        df = corpus_unigram_freq.join(background_df, how='outer').fillna(0)
        return df
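    # Usage sketch (illustrative only, not executed here; `corpus` is assumed
    # to be an instance of this class or a subclass, built elsewhere):
    #
    #     counts = corpus.get_unigram_corpus().get_term_and_background_counts()
    #     # Unigrams frequent in the corpus but absent from the background
    #     # frequencies are candidates for corpus-specific vocabulary.
    #     counts[(counts.corpus > 0) & (counts.background == 0)]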
    def get_term_count_df(self):
        return pd.DataFrame({'corpus': self._X.sum(axis=0).A1,
                             'term': self.get_terms()}).set_index('term')

    def _get_corpus_unigram_freq(self, corpus_freq_df):
        unigram_validator = re.compile('^[A-Za-z]+$')
        corpus_unigram_freq = corpus_freq_df.loc[[term for term
                                                  in corpus_freq_df.index
                                                  if unigram_validator.match(term) is not None]]
        return corpus_unigram_freq

    def _get_background_unigram_frequencies(self):
        if self.get_background_corpus() is not None:
            return self.get_background_corpus()
        return DefaultBackgroundFrequencies.get_background_frequency_df(self._unigram_frequency_path)

    def list_extra_features(self):
        '''
        Returns
        -------
        List of dicts.  One dict for each document; keys are metadata, values are counts.
        '''
        return FeatureLister(self._mX,
                             self._metadata_idx_store,
                             self.get_num_docs()).output()

    def get_terms(self):
        '''
        Returns
        -------
        list of unique terms
        '''
        return self._term_idx_store._i2val

    def get_metadata(self):
        '''
        Returns
        -------
        list of unique metadata items
        '''
        return self._metadata_idx_store._i2val

    def get_total_unigram_count(self):
        return self._get_unigram_term_freq_df().sum()

    def _get_unigram_term_freq_df(self):
        return self._get_corpus_unigram_freq(
            # self.get_term_freq_df().sum(axis=1)
            self.get_term_count_df()['corpus']
        )

    def _get_X_after_delete_terms(self, idx_to_delete_list, non_text=False):
        new_term_idx_store = self._get_relevant_idx_store(non_text).batch_delete_idx(idx_to_delete_list)
        new_X = delete_columns(self._get_relevant_X(non_text), idx_to_delete_list)
        return new_X, new_term_idx_store

    def _get_relevant_X(self, non_text):
        return self._mX if non_text else self._X

    def _get_relevant_idx_store(self, non_text):
        return self._metadata_idx_store if non_text else self._term_idx_store

    def remove_infrequent_words(self, minimum_term_count, term_ranker=AbsoluteFrequencyRanker):
        '''
        Returns
        -------
        A new TermDocumentMatrix consisting of only terms which occur more than
        minimum_term_count times; terms occurring at most minimum_term_count
        times are removed.
        '''
        tdf = term_ranker(self).get_ranks().sum(axis=1)
        return self.remove_terms(list(tdf[tdf <= minimum_term_count].index))

    def remove_entity_tags(self):
        '''
        Returns
        -------
        A new TermDocumentMatrix consisting of only terms in the current
        TermDocumentMatrix that aren't spaCy entity tags.

        Note: Used if entity types are censored using
        FeatsFromSpacyDoc(tag_types_to_censor=...).
        '''
        terms_to_remove = [term for term in self._term_idx_store._i2val
                           if any([word in SPACY_ENTITY_TAGS for word in term.split()])]
        return self.remove_terms(terms_to_remove)

    def remove_terms(self, terms, ignore_absences=False, non_text=False):
        '''Non-destructive term removal.

        Parameters
        ----------
        terms : list
            list of terms to remove
        ignore_absences : bool, False by default
            If a term does not appear, don't raise an error; just move on.
        non_text : bool, False by default
            Remove metadata terms instead of regular terms.

        Returns
        -------
        TermDocMatrix, new object with terms removed.
        '''
        idx_to_delete_list = self._build_term_index_list(ignore_absences, terms, non_text)
        return self.remove_terms_by_indices(idx_to_delete_list, non_text)
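    # Usage sketch (illustrative only, not executed here): all removal methods
    # are non-destructive and return a new object, so they chain naturally.
    # `corpus` is assumed to be an instance of this class; the term list is
    # made up for the example.
    #
    #     trimmed = (corpus
    #                .remove_infrequent_words(minimum_term_count=3)
    #                .remove_terms(['the', 'of'], ignore_absences=True))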
    def whitelist_terms(self, whitelist_terms):
        '''
        :param whitelist_terms: list[str], terms to whitelist
        :return: TermDocMatrix, new object containing only the whitelisted terms
        '''
        return self.remove_terms(list(set(self.get_terms()) - set(whitelist_terms)))

    def _build_term_index_list(self, ignore_absences, terms, non_text=False):
        idx_to_delete_list = []
        my_term_idx_store = self._get_relevant_idx_store(non_text)
        for term in terms:
            if term not in my_term_idx_store:
                if not ignore_absences:
                    raise KeyError('Term %s not found' % (term))
                continue
            idx_to_delete_list.append(my_term_idx_store.getidx(term))
        return idx_to_delete_list

    def _make_new_term_doc_matrix(self,
                                  new_X=None,
                                  new_mX=None,
                                  new_y=None,
                                  new_term_idx_store=None,
                                  new_category_idx_store=None,
                                  new_metadata_idx_store=None,
                                  new_y_mask=None):
        # new_y, new_category_idx_store and new_y_mask are accepted for
        # compatibility with categorized subclasses and ignored here.
        return TermDocMatrixWithoutCategories(
            X=new_X if new_X is not None else self._X,
            mX=new_mX if new_mX is not None else self._mX,
            term_idx_store=new_term_idx_store if new_term_idx_store is not None else self._term_idx_store,
            metadata_idx_store=new_metadata_idx_store if new_metadata_idx_store is not None else self._metadata_idx_store,
            unigram_frequency_path=self._unigram_frequency_path
        )

    def remove_terms_used_in_less_than_num_docs(self, threshold, non_text=False):
        '''
        Parameters
        ----------
        threshold : int
            Minimum number of documents a term must appear in to be kept
        non_text : bool
            Use non-text features instead of terms

        Returns
        -------
        TermDocMatrix, new object with terms removed.
        '''
        term_counts = self._get_relevant_X(non_text).astype(bool).astype(int).sum(axis=0).A[0]
        terms_to_remove = np.where(term_counts < threshold)[0]
        return self.remove_terms_by_indices(terms_to_remove, non_text)

    def get_unigram_corpus(self):
        '''
        Returns
        -------
        A new TermDocumentMatrix consisting of only unigrams in the current
        TermDocumentMatrix.
        '''
        terms_to_ignore = self._get_non_unigrams()
        return self.remove_terms(terms_to_ignore)

    def _get_non_unigrams(self):
        return [term for term
                in self._term_idx_store._i2val
                if ' ' in term
                or (self._strict_unigram_definition and "'" in term)]

    def get_stoplisted_unigram_corpus(self, stoplist=None):
        '''
        Parameters
        ----------
        stoplist : list, optional

        Returns
        -------
        A new TermDocumentMatrix consisting of only the unigrams in the current
        TermDocumentMatrix which do not appear in the stoplist.
        '''
        if stoplist is None:
            stoplist = self.get_default_stoplist()
        else:
            stoplist = [w.lower() for w in stoplist]
        return self._remove_terms_from_list(stoplist)

    def get_stoplisted_unigram_corpus_and_custom(self, custom_stoplist):
        '''
        Parameters
        ----------
        custom_stoplist : str or list of lower-cased words

        Returns
        -------
        A new TermDocumentMatrix consisting of only the unigrams in the current
        TermDocumentMatrix which appear in neither the default stoplist nor
        custom_stoplist.
        '''
        if type(custom_stoplist) == str:
            custom_stoplist = [custom_stoplist]
        return self._remove_terms_from_list(set(self.get_default_stoplist())
                                            | set(w.lower() for w in custom_stoplist))

    def _remove_terms_from_list(self, stoplist):
        terms_to_ignore = [term for term
                           in self._term_idx_store._i2val
                           if ' ' in term
                           or (self._strict_unigram_definition and "'" in term)
                           or term in stoplist]
        return self.remove_terms(terms_to_ignore)

    def metadata_in_use(self):
        '''
        Returns True if metadata values are in the term-document matrix.

        Returns
        -------
        bool
        '''
        return len(self._metadata_idx_store) > 0
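    # Usage sketch (illustrative only, not executed here): a common
    # preprocessing pipeline keeps only stoplisted unigrams, then drops terms
    # confined to a single document. `corpus` is assumed to be an instance of
    # this class; the custom stopword list is made up for the example.
    #
    #     clean = (corpus
    #              .get_stoplisted_unigram_corpus_and_custom(['uh', 'um'])
    #              .remove_terms_used_in_less_than_num_docs(threshold=2))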
    def _make_all_positive_data_ones(self, newX):
        # type: (sparse_matrix) -> sparse_matrix
        return (newX > 0).astype(np.int32)

    def get_doc_lengths(self):
        '''
        Returns the lengths of the documents in words.

        Returns
        -------
        np.array
        '''
        idx_to_delete_list = self._build_term_index_list(True, self._get_non_unigrams())
        unigram_X, _ = self._get_X_after_delete_terms(idx_to_delete_list)
        return unigram_X.sum(axis=1).A1

    def remove_terms_by_indices(self, idx_to_delete_list, non_text=False):
        '''
        Parameters
        ----------
        idx_to_delete_list : list
        non_text : bool
            Should we remove non-text features or just terms?

        Returns
        -------
        TermDocMatrix
        '''
        new_X, new_idx_store = self._get_X_after_delete_terms(idx_to_delete_list, non_text)
        return self._make_new_term_doc_matrix(
            new_X=self._X if non_text else new_X,
            new_mX=new_X if non_text else self._mX,
            new_y=None,
            new_category_idx_store=None,
            new_term_idx_store=self._term_idx_store if non_text else new_idx_store,
            new_metadata_idx_store=new_idx_store if non_text else self._metadata_idx_store,
            new_y_mask=np.ones(new_X.shape[0]).astype(bool)
        )

    def get_scaled_f_scores_vs_background(self,
                                          scaler_algo=DEFAULT_BACKGROUND_SCALER_ALGO,
                                          beta=DEFAULT_BACKGROUND_BETA):
        '''
        Parameters
        ----------
        scaler_algo : str
            see get_scaled_f_scores, default 'none'
        beta : float
            default 1.

        Returns
        -------
        pd.DataFrame of scaled_f_score scores compared to the background corpus
        '''
        df = self.get_term_and_background_counts()
        df['Scaled f-score'] = ScaledFScore.get_scores_for_category(
            df['corpus'], df['background'], scaler_algo, beta
        )
        return df.sort_values(by='Scaled f-score', ascending=False)

    def get_term_doc_mat(self):
        '''
        Returns the sparse matrix representation of the term-document matrix.

        Returns
        -------
        scipy.sparse.csr_matrix
        '''
        return self._X

    def get_metadata_doc_mat(self):
        '''
        Returns the sparse matrix representation of the metadata-document matrix.

        Returns
        -------
        scipy.sparse.csr_matrix
        '''
        return self._mX

    def term_doc_lists(self):
        '''
        Returns
        -------
        dict mapping each term to the list of ids of the documents containing it
        '''
        doc_ids = self._X.transpose().tolil().rows
        terms = self._term_idx_store.values()
        return dict(zip(terms, doc_ids))

    def apply_ranker(self, term_ranker, use_non_text_features):
        '''
        Parameters
        ----------
        term_ranker : TermRanker
        use_non_text_features : bool

        Returns
        -------
        pd.DataFrame
        '''
        if use_non_text_features:
            return term_ranker(self).use_non_text_features().get_ranks()
        return term_ranker(self).get_ranks()

    def add_doc_names_as_metadata(self, doc_names):
        '''
        :param doc_names: array-like[str], document names for each document
        :return: Corpus-like object with doc names as metadata. If two documents
            share the same name, a document number will be appended to their names.
        '''
        if len(doc_names) != self.get_num_docs():
            raise Exception("The parameter doc_names contains %s elements. "
                            "It should have %s elements, one per document."
                            % (len(doc_names), self.get_num_docs()))

        doc_names_counter = collections.Counter(np.array(doc_names))
        metafact = CSRMatrixFactory()
        metaidxstore = IndexStore()
        doc_id_uses = collections.Counter()
        for i in range(self.get_num_docs()):
            doc_id = doc_names[i]
            if doc_names_counter[doc_id] > 1:
                doc_id_uses[doc_id] += 1
                doc_name_idx = metaidxstore.getidx('%s (%s)' % (doc_id, doc_id_uses[doc_id]))
            else:
                doc_name_idx = metaidxstore.getidx(doc_id)
            # Mark document i with its name's metadata column. (The flattened
            # original read `metafact[i, i] = doc_name_idx`, which would build a
            # #docs x #docs matrix and fail add_metadata's shape assertion.)
            metafact[i, doc_name_idx] = 1
        return self.add_metadata(metafact.get_csr_matrix(), metaidxstore)
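    # Usage sketch (illustrative only, not executed here): attaching one name
    # per document. Duplicate names are disambiguated with a per-name counter.
    # `corpus` is assumed to be an instance of this class with two documents.
    #
    #     named = corpus.add_doc_names_as_metadata(['speech', 'speech'])
    #     named.get_metadata()  # ['speech (1)', 'speech (2)']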
    def add_metadata(self, metadata_matrix, meta_index_store):
        '''
        Returns a new corpus with the metadata matrix and index store integrated.

        :param metadata_matrix: scipy.sparse matrix (# docs, # metadata)
        :param meta_index_store: IndexStore of metadata values
        :return: TermDocMatrixWithoutCategories
        '''
        assert isinstance(meta_index_store, IndexStore)
        assert len(metadata_matrix.shape) == 2
        assert metadata_matrix.shape[0] == self.get_num_docs()
        return self._make_new_term_doc_matrix(new_X=self._X,
                                              new_y=None,
                                              new_category_idx_store=None,
                                              new_y_mask=np.ones(self.get_num_docs()).astype(bool),
                                              new_mX=metadata_matrix,
                                              new_term_idx_store=self._term_idx_store,
                                              new_metadata_idx_store=meta_index_store)
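
if __name__ == '__main__':
    # Minimal smoke-test sketch, not part of the library. It assumes only the
    # imports above plus scipy, and that IndexStore.getidx registers a value
    # and returns its integer index (as the methods above already rely on).
    from scipy.sparse import csr_matrix

    term_idx_store = IndexStore()
    for term in ['obama', 'romney', 'barack obama']:
        term_idx_store.getidx(term)
    # Two documents, three terms; the last term is a bigram.
    X = csr_matrix(np.array([[3, 1, 1],
                             [0, 2, 0]]))
    tdm = TermDocMatrixWithoutCategories(X=X,
                                         mX=csr_matrix((2, 0)),
                                         term_idx_store=term_idx_store,
                                         metadata_idx_store=IndexStore())
    print(tdm.get_num_docs(), tdm.get_num_terms())       # 2 3
    print(tdm.get_unigram_corpus().get_terms())          # drops 'barack obama'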