Python sklearn.feature_extraction.text.CountVectorizer() Examples

The following are 30 code examples showing how to use sklearn.feature_extraction.text.CountVectorizer(). They are extracted from open source projects; the originating project, author, source file, and license are noted above each example.

You may also want to check out all available functions and classes of the sklearn.feature_extraction.text module.
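Before turning to the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the basic fit/transform workflow that most of them build on:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat", "the dog sat on the log"]
vectorizer = CountVectorizer()            # default word-level tokenization
X = vectorizer.fit_transform(docs)        # sparse document-term count matrix
print(vectorizer.get_feature_names())     # learned vocabulary (get_feature_names_out() on scikit-learn >= 1.0)
print(X.toarray())                        # dense view of the counts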

Example 1
Project: nyoka   Author: nyoka-pmml   File: _validateSchema.py    License: Apache License 2.0
def test_validate_sklearn_sgd_with_text_cv(self):
        categories = ['alt.atheism','talk.religion.misc']
        data = fetch_20newsgroups(subset='train', categories=categories)
        X = data.data[:4]
        Y = data.target[:4]
        features = ['input']
        target = 'output'
        model = SGDClassifier(loss="log")
        file_name = model.__class__.__name__ + '_CountVec_.pmml'
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('clf', model)
        ])
        pipeline.fit(X, Y)
        skl_to_pmml(pipeline, features, target, file_name)
        self.assertEqual(self.schema.is_valid(file_name), True) 
Example 2
Project: nyoka   Author: nyoka-pmml   File: testScoreWithAdapaLgbm.py    License: Apache License 2.0
def test_04_lgbm_regressor(self):
        print("\ntest 04 (lgbm regressor with preprocessing)\n")
        auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
        X = auto.drop(['mpg'], axis=1)
        y = auto['mpg']

        feature_names = [name for name in auto.columns if name not in ('mpg',)]
        target_name='mpg'
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
        pd.DataFrame(data=x_test, columns=feature_names).to_csv("test.csv",index=False)
        pipeline_obj = Pipeline([
            ('mapper', DataFrameMapper([
                ('car name', CountVectorizer()),
                (['displacement'],[StandardScaler()]) 
            ])),
            ('lgbmr',LGBMRegressor())
        ])
        pipeline_obj.fit(x_train,y_train)
        file_name = "test04lgbm.pmml"
        lgb_to_pmml(pipeline_obj, feature_names, 'mpg', file_name)
        model_name  = self.adapa_utility.upload_to_zserver(file_name)
        predictions, _ = self.adapa_utility.score_in_zserver(model_name, "test.csv")
        predictions = numpy.array(predictions)
        model_pred = pipeline_obj.predict(x_test)
        self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True) 
Example 3
Project: interpret-text   Author: interpretml   File: common_utils.py    License: MIT License
def create_logistic_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LogisticRegression(random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("lr", lr)]) 
Example 4
Project: PathCon   Author: hwwang55   File: data_loader.py    License: MIT License
def read_relations(file_name):
    bow = []
    count_vec = CountVectorizer()

    d = {}
    file = open(file_name)
    for line in file:
        index, name = line.strip().split('\t')
        d[name] = int(index)

        if args.feature_type == 'bow' and not os.path.exists('../data/' + args.dataset + '/bow.npy'):
            tokens = re.findall('[a-z]{2,}', name)
            bow.append(' '.join(tokens))
    file.close()

    if args.feature_type == 'bow' and not os.path.exists('../data/' + args.dataset + '/bow.npy'):
        bow = count_vec.fit_transform(bow)
        np.save('../data/' + args.dataset + '/bow.npy', bow.toarray())

    return d 
Example 5
Project: chowmein   Author: xiaohan2012   File: test_pmi_w2l.py    License: MIT License
def test_from_texts():
    cal = PMICalculator(doc2word_vectorizer=CountVectorizer(min_df=0),
                        doc2label_vectorizer=LabelCountVectorizer())
    actual = cal.from_texts(docs, labels)
    assert_equal(actual.shape[1], 4)
    assert_equal(actual.shape[0], 9)
    assert_equal(cal.index2word_, {0: u'information',
                                   1: u'language',
                                   2: u'learning',
                                   3: u'machine',
                                   4: u'mining',
                                   5: u'natural',
                                   6: u'processing',
                                   7: u'retrieval',
                                   8: u'text'})
    assert_equal(cal.index2label_, {0: 'information retrieval'.split(),
                                    1: 'machine learning'.split(),
                                    2: 'natural language processing'.split(),
                                    3: 'text mining'.split()}) 
Example 6
Project: scattertext   Author: JasonKessler   File: test_corpusFromScikit.py    License: Apache License 2.0
def _te_ss_t_build(self):
		from sklearn.datasets import fetch_20newsgroups
		from sklearn.feature_extraction.text import CountVectorizer

		newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
		count_vectorizer = CountVectorizer()
		X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
		corpus = CorpusFromScikit(
			X=X_counts,
			y=newsgroups_train.target,
			feature_vocabulary=count_vectorizer.vocabulary_,
			category_names=newsgroups_train.target_names,
			raw_texts=newsgroups_train.data
		).build()
		self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
		self.assertEqual(corpus
		                 .get_term_freq_df()
		                 .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
		                 .sort_values(by='score', ascending=False).index.tolist()[:5],
		                 ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
		self.assertGreater(len(corpus.get_texts()[0]), 5) 
Example 7
Project: scattertext   Author: JasonKessler   File: test_termDocMatrixFromScikit.py    License: Apache License 2.0
def test_build(self):
		from sklearn.feature_extraction.text import CountVectorizer
		categories, docs = get_docs_categories_semiotic()
		idx_store = IndexStore()
		y = np.array([idx_store.getidx(c) for c in categories])
		count_vectorizer = CountVectorizer()
		X_counts = count_vectorizer.fit_transform(docs)
		term_doc_mat = TermDocMatrixFromScikit(
			X=X_counts,
			y=y,
			feature_vocabulary=count_vectorizer.vocabulary_,
			category_names=idx_store.values()).build()
		self.assertEqual(term_doc_mat.get_categories()[:2], ['hamlet', 'jay-z/r. kelly'])
		self.assertEqual(term_doc_mat
		                 .get_term_freq_df()
		                 .assign(score=term_doc_mat.get_scaled_f_scores('hamlet'))
		                 .sort_values(by='score', ascending=False).index.tolist()[:5],
		                 ['that', 'march', 'did', 'majesty', 'sometimes']) 
Example 8
Project: metal   Author: HazyResearch   File: ngram_featurizer.py    License: Apache License 2.0
def __init__(
        self,
        anonymize=True,
        trim_window=5,
        lowercase=True,
        drop_stopwords=True,
        stem=True,
        ngram_range=(1, 3),
        **vectorizer_kwargs,
    ):
        self.anonymize = anonymize
        self.lowercase = lowercase
        self.drop_stopwords = drop_stopwords
        if drop_stopwords:
            nltk.download("stopwords")
            self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.trim_window = trim_window
        self.stem = stem
        if stem:
            self.porter = nltk.PorterStemmer()

        self.vectorizer = CountVectorizer(
            ngram_range=ngram_range, binary=True, **vectorizer_kwargs
        ) 
Example 9
Project: fanci   Author: fanci-dga-detection   File: feature_extraction.py    License: GNU General Public License v3.0
def _n_grams():
    """
    Calculates various statistical features over the 1-,2- and 3-grams of the suffix and dot free domain
    :return: 
    """
    global __unigram
    feature = []

    for i in range(1,4):
        ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(i, i))
        counts = ngram_vectorizer.build_analyzer()(__joined_dot_split_suffix_free)
        npa = numpy.array(list(Counter(counts).values()), dtype=int)
        if i == 1:
            __unigram = npa

        feature += __stats_over_n_grams(npa)

    return feature 
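The key trick above is that CountVectorizer.build_analyzer() returns the raw analysis callable, so the character n-grams of a single string can be counted directly, without building a document-term matrix. A minimal illustration (not part of the fanci project):

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

analyzer = CountVectorizer(analyzer='char', ngram_range=(2, 2)).build_analyzer()
bigrams = analyzer("example")   # ['ex', 'xa', 'am', 'mp', 'pl', 'le']
print(Counter(bigrams))         # per-bigram counts, as aggregated above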
Example 10
Project: cltk   Author: cltk   File: stop.py    License: MIT License
def __init__(self, language=None):
        """ Initialize stoplist builder with option for language specific parameters
        :type language: str
        :param language : text from which to build the stoplist
        """
        if language:
            self.language = language.lower()
        self.numpy_installed = True  # Write utility for common import traps?
        self.sklearn_installed = True

        try:
            import numpy as np
            self.np = np
        except ImportError:
            self.numpy_installed = False

        try:
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            # self.vectorizer = CountVectorizer(input='content') # Set df?
            # self.tfidf_vectorizer = TfidfVectorizer()
        except ImportError:
            self.sklearn_installed = False 
Example 11
Project: text-classifier   Author: shibing624   File: feature.py    License: Apache License 2.0
def tf_word_feature(self, data_set):
        """
        Get TF feature by word
        :param data_set:
        :return:
        """
        data_set = get_word_segment_data(data_set)
        if self.is_infer:
            self.vectorizer = load_pkl(self.feature_vec_path)
            data_feature = self.vectorizer.transform(data_set)
        else:
            self.vectorizer = CountVectorizer(vocabulary=self.word_vocab)
            data_feature = self.vectorizer.fit_transform(data_set)
        vocab = self.vectorizer.vocabulary_
        logger.info('Vocab size:%d' % len(vocab))
        feature_names = self.vectorizer.get_feature_names()
        logger.info('feature_names:%s' % feature_names[:20])
        logger.info(data_feature.shape)
        if not self.is_infer:
            save_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
        return data_feature 
Example 12
Project: Spider   Author: starFalll   File: LDA_Analysis.py    License: MIT License
def word2vec(word_list,n_features=1000,topics = 5):
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    #stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(word_list)

    lda = LatentDirichletAllocation(n_components=topics,  # number of topics
                                    learning_method='batch',  # 'batch' works well for a small corpus and leaves fewer hyperparameters to tune
                                    )
    # train the model with variational Bayes
    lda.fit(tf)

    # output the keyword list for each topic in turn
    tf_feature_names = tf_vectorizer.get_feature_names()

    return lda,tf,tf_feature_names,tf_vectorizer

# visualize the topics
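A hedged usage sketch (not part of the Spider project) showing how the objects returned by word2vec() above could be used to print each topic's top keywords; word_list is assumed to be a list of whitespace-separated documents:

lda, tf, tf_feature_names, tf_vectorizer = word2vec(word_list, n_features=1000, topics=5)
for topic_idx, topic in enumerate(lda.components_):
    top_words = [tf_feature_names[i] for i in topic.argsort()[:-11:-1]]  # 10 highest-weighted terms
    print("Topic %d: %s" % (topic_idx, " ".join(top_words)))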
Example 13
Project: lexpredict-contraxsuite   Author: LexPredict   File: field_types.py    License: GNU Affero General Public License v3.0
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        """
        Build SKLearn vectorization pipeline for this field.
        This is used in field-based machine learning when we calculate the value of one field based on the
        values of other fields of this document.

        We are able to detect only choice fields this way at the moment.

        To reach this we need to build a feature vector of all dependencies of the field being detected.
        This feature vector is built as a union of feature vectors of each dependency.

        See how the whole pipeline is built in FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline(..)

        :return: Tuple of: 1. List of vectorization steps - to be added to a Pipeline()
                           2. List of str feature names or a function returning list of str feature names.
        """

        vect = CountVectorizer(strip_accents='unicode', analyzer='word',
                               stop_words=self._build_stop_words())
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', vect),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect) 
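A minimal sketch (not from the lexpredict-contraxsuite code base) of how the returned steps and the feature-name callable might be consumed; field_type and documents are hypothetical placeholders:

from sklearn.pipeline import Pipeline

steps, get_feature_names = field_type.build_vectorization_pipeline()  # field_type: hypothetical field-type instance
pipe = Pipeline(steps)                    # 'clean' -> 'vect' -> 'tfidf'
features = pipe.fit_transform(documents)  # documents: hypothetical list of field values (may contain None)
print(get_feature_names()[:10])           # vocabulary seen by the fitted CountVectorizer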
Example 14
Project: interpret-text   Author: interpretml   File: utils_classical.py    License: MIT License
def __init__(self):
        """Initializes the Encoder object and sets internal tokenizer,
            labelEncoder and vectorizer using predefined objects.
        """
        self.tokenizer = BOWTokenizer(
            English()
        )  # the tokenizer must have a tokenize() and parse() function.
        self.labelEncoder = LabelEncoder()
        self.vectorizer = CountVectorizer(
            tokenizer=self.tokenizer.tokenize, ngram_range=(1, 1)
        )
        self.decode_params = {}

    # The keep_ids flag is used by explain local in the explainer to decode
    # importances over raw features. 
Example 15
Project: interpret-text   Author: interpretml   File: common_utils.py    License: MIT License
def create_random_forest_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    rf = RandomForestClassifier(n_estimators=500, random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("rf", rf)]) 
Example 16
Project: interpret-text   Author: interpretml   File: common_utils.py    License: MIT License
def create_linear_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LinearRegression()
    return Pipeline([("vectorizer", vectorizer), ("lr", lr)]) 
Example 17
Project: recordlinkage   Author: J535D165   File: string.py    License: BSD 3-Clause "New" or "Revised" License
def qgram_similarity(s1, s2, include_wb=True, ngram=(2, 2)):

    if len(s1) != len(s2):
        raise ValueError('Arrays or Series have to be same length.')

    if len(s1) == len(s2) == 0:
        return []

    # include word boundaries or not
    analyzer = 'char_wb' if include_wb is True else 'char'

    # prepare data
    data = s1.append(s2).fillna('')

    # The vectorizer
    vectorizer = CountVectorizer(
        analyzer=analyzer, strip_accents='unicode', ngram_range=ngram)

    vec_fit = vectorizer.fit_transform(data)

    def _metric_sparse_euclidean(u, v):

        match_ngrams = u.minimum(v).sum(axis=1)
        total_ngrams = np.maximum(u.sum(axis=1), v.sum(axis=1))

        # division by zero is not possible in our case, but 0/0 is possible.
        # Numpy raises a warning in that case.

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            m = np.true_divide(match_ngrams, total_ngrams).A1

        return m

    return _metric_sparse_euclidean(vec_fit[:len(s1)], vec_fit[len(s1):]) 
Example 18
Project: recordlinkage   Author: J535D165   File: string.py    License: BSD 3-Clause "New" or "Revised" License
def cosine_similarity(s1, s2, include_wb=True, ngram=(2, 2)):

    if len(s1) != len(s2):
        raise ValueError('Arrays or Series have to be same length.')

    if len(s1) == len(s2) == 0:
        return []

    # include word boundaries or not
    analyzer = 'char_wb' if include_wb is True else 'char'

    # The vectorizer
    vectorizer = CountVectorizer(
        analyzer=analyzer, strip_accents='unicode', ngram_range=ngram)

    data = s1.append(s2).fillna('')

    vec_fit = vectorizer.fit_transform(data)

    def _metric_sparse_cosine(u, v):

        a = np.sqrt(u.multiply(u).sum(axis=1))
        b = np.sqrt(v.multiply(v).sum(axis=1))

        ab = v.multiply(u).sum(axis=1)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            m = np.divide(ab, np.multiply(a, b)).A1

        return m

    return _metric_sparse_cosine(vec_fit[:len(s1)], vec_fit[len(s1):]) 
Example 19
Project: corpus-to-graph-ml   Author: CatalystCode   File: features_generation_tools.py    License: MIT License
def to_bag_of_words(train_samples, test_samples, ngram_range=DEFAULT_BOW_NGRAM_RANGE, 
                      max_features=DEFAULT_BOW_MAX_FEATURES, binary=DEFAULT_BOW_BINARY):
        # Initialize the "CountVectorizer" object, which is scikit-learn's bag-of-words tool.
        vectorizer = CountVectorizer(analyzer = "word",
                                     tokenizer = None,
                                     preprocessor = None,
                                     stop_words = None,
                                     max_features = max_features,
                                     binary = binary,
                                     ngram_range=ngram_range)

        train_data_features = vectorizer.fit_transform(train_samples)
        test_data_features = vectorizer.transform(test_samples)
        return train_data_features, test_data_features, vectorizer 
Example 20
Project: chowmein   Author: xiaohan2012   File: test_pmi_w2l.py    License: MIT License
def test_from_texts_nonexisting_label():
    cal = PMICalculator(doc2word_vectorizer=CountVectorizer(min_df=0),
                        doc2label_vectorizer=LabelCountVectorizer())
    actual = cal.from_texts(docs, labels[:2] + [('haha', 'lala')] +
                            labels[2:] + [('non', 'existing')])
    assert_equal(actual.shape[1], 4)
    assert_equal(cal.index2label_, {0: 'information retrieval'.split(),
                                    1: 'machine learning'.split(),
                                    2: 'natural language processing'.split(),
                                    3: 'text mining'.split()}) 
Example 21
Project: Zeroshot-QuestionGeneration   Author: hadyelsahar   File: baselines.py    License: MIT License
def train(self, x, y, textual_evidence=None):

        if self.TEXT:
            x = textual_evidence

            x = [str(i) for i in x]  # make sure every element is a string

        self.train_x = x
        self.train_y = y

        # VECTORIZATION
        print("vectorization..")
        # X = np.concatenate([train['triples_prep'], test['triples_prep']])

        self.count_vect = CountVectorizer().fit(x)
        x = self.count_vect.transform(x)

        self.tf_transformer = TfidfTransformer().fit(x)
        x = self.tf_transformer.transform(x)

        self.svd = TruncatedSVD(n_components=self.N_COMPONENTS).fit(x)
        x = self.svd.transform(x)

        # CLUSTERING
        print("clustering..")
        self.neigh = NearestNeighbors(n_neighbors=self.K, radius=self.RADIUS)
        self.neigh.fit(x)  # fit nearest neighbours on the training set only 
Example 22
Project: fever-naacl-2018   Author: sheffieldnlp   File: fever_features.py    License: Apache License 2.0
def inform(self,train,dev=None,test=None):
        claims = self.claims(train)
        bodies = self.bodies(train)

        if dev is not None:
            dev_claims = self.claims(dev)
            dev_bodies = self.bodies(dev)
        else:
            dev_claims = []
            dev_bodies = []

        if test is not None:
            test_claims = self.claims(test)
            test_bodies = self.bodies(test)
        else:
            test_claims = []
            test_bodies = []

        self.logger.info("Count word frequencies")
        self.bow_vectorizer = CountVectorizer(max_features=self.lim_unigram,
                                         stop_words=TermFrequencyFeatureFunction.stop_words)
        self.bow = self.bow_vectorizer.fit_transform(claims + bodies)

        self.logger.info("Generate TF Vectors")
        self.tfreq_vectorizer = TfidfTransformer(use_idf=False).fit(self.bow)

        self.logger.info("Generate TF-IDF Vectors")
        self.tfidf_vectorizer = TfidfVectorizer(max_features=self.lim_unigram,
                                           stop_words=TermFrequencyFeatureFunction.stop_words). \
            fit(claims + bodies + dev_claims + dev_bodies + test_claims + test_bodies) 
Example 23
Project: rasa_lookup_demo   Author: RasaHQ   File: create_ngrams.py    License: Apache License 2.0
def transorm_ngrams(df_x, x_train, x_test):
    print("\ntransforming inputs with ngrams...")
    vectorizer = CountVectorizer(ngram_range=(1, 5), analyzer="char")
    X = vectorizer.fit_transform(df_x)
    X_train = vectorizer.transform(x_train)
    X_test = vectorizer.transform(x_test)
    names = vectorizer.get_feature_names()
    return X_train, X_test, names 
Example 24
Project: kaggle-HomeDepot   Author: ChenglongChen   File: feature_vector_space.py    License: MIT License
def _init_word_bow(self, ngram, vocabulary=None):
        bow = CountVectorizer(min_df=3,
                                max_df=0.75,
                                max_features=None,
                                # norm="l2",
                                strip_accents="unicode",
                                analyzer="word",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram),
                                vocabulary=vocabulary)
        return bow

    ## word based 
Example 25
def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

# vectorizer = CountVectorizer(min_df=1, stop_words='english',
# preprocessor=stemmer) 
Example 26
Project: causal-text-embeddings   Author: blei-lab   File: helpers.py    License: MIT License
def tokenize_documents(documents,max_df0=0.9, min_df0=0.001):
	from nltk.corpus import stopwords
	'''
	From a list of documents raw text build a matrix DxV
	D: number of docs
	V: size of the vocabulary, i.e. number of unique terms found in the whole set of docs
	'''
	count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0)
	corpus = count_vect.fit_transform(documents)
	vocabulary = count_vect.get_feature_names()
	
	return corpus,vocabulary,count_vect 
Example 27
Project: causal-text-embeddings   Author: blei-lab   File: helpers.py    License: MIT License
def tokenize_documents(documents,max_df0=0.9, min_df0=0.0005):
	from nltk.corpus import stopwords
	'''
	From a list of documents raw text build a matrix DxV
	D: number of docs
	V: size of the vocabulary, i.e. number of unique terms found in the whole set of docs
	'''
	count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0)
	corpus = count_vect.fit_transform(documents)
	vocabulary = count_vect.get_feature_names()
	
	return corpus,vocabulary,count_vect 
Example 28
Project: causal-text-embeddings   Author: blei-lab   File: helpers.py    License: MIT License
def tokenize_documents(documents,max_df0=0.9, min_df0=0.001):
	from nltk.corpus import stopwords
	'''
	From a list of documents raw text build a matrix DxV
	D: number of docs
	V: size of the vocabulary, i.e. number of unique terms found in the whole set of docs
	'''
	count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0)
	corpus = count_vect.fit_transform(documents)
	vocabulary = count_vect.get_feature_names()
	
	return corpus,vocabulary,count_vect 
Example 29
Project: causal-text-embeddings   Author: blei-lab   File: helpers.py    License: MIT License
def tokenize_documents(documents,max_df0=0.8, min_df0=0.01,print_vocabulary=False,outfolder=None,output_vocabulary_fname='vocabulary.dat'):
	from nltk.corpus import stopwords
	'''
	From a list of documents raw text build a matrix DxV
	D: number of docs
	V: size of the vocabulary, i.e. number of unique terms found in the whole set of docs
	'''
	stop = stopwords.words('english')
	count_vect = CountVectorizer(stop_words=stop,max_df=max_df0, min_df=min_df0)
	corpus = count_vect.fit_transform(documents)
	vocabulary = count_vect.get_feature_names()
	
	return corpus,vocabulary,count_vect 
Example 30
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_pipeline.py    License: MIT License
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert "chars__" in feat or "words__" in feat
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(AttributeError,
                         'Transformer tr1 (type Transf) does not provide '
                         'get_feature_names', ft.get_feature_names)