Python nltk.PorterStemmer() Examples

The following code examples show how to use nltk.PorterStemmer(). They are drawn from open-source Python projects.
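Before the project examples, here is a minimal sketch of the basic API: nltk.PorterStemmer() constructs a stemmer whose stem() method reduces a word to its root form.

import nltk

# Create a Porter stemmer and reduce a few words to their stems.
stemmer = nltk.PorterStemmer()
print(stemmer.stem("running"))   # run
print(stemmer.stem("caresses"))  # caress
print(stemmer.stem("ponies"))    # poni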

Example 1
Project: metal   Author: HazyResearch   File: ngram_featurizer.py    Apache License 2.0
def __init__(
        self,
        anonymize=True,
        trim_window=5,
        lowercase=True,
        drop_stopwords=True,
        stem=True,
        ngram_range=(1, 3),
        **vectorizer_kwargs,
    ):
        self.anonymize = anonymize
        self.lowercase = lowercase
        self.drop_stopwords = drop_stopwords
        if drop_stopwords:
            nltk.download("stopwords")
            self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.trim_window = trim_window
        self.stem = stem
        if stem:
            self.porter = nltk.PorterStemmer()

        self.vectorizer = CountVectorizer(
            ngram_range=ngram_range, binary=True, **vectorizer_kwargs
        ) 
Example 2
Project: qa-scrapers   Author: collab-uniba   File: discretizer.py    MIT License
def build_stems(self, corpus):
        stems_filename = '{0}_stems.txt'.format(self.db_name)
        if os.path.isfile(stems_filename):  # load stems from file
            stems = list()
            self.log('Loading existing stems from {0}'.format(stems_filename), logging.INFO)
            with open(stems_filename, 'rt') as f:
                for stem in f:
                    stems.append(stem.strip())
        else:
            self.log('Creating stems from corpus into {0}. Please wait, this may take some time.'.format(
                stems_filename), logging.INFO)
            porter_stemmer = PorterStemmer()
            try:
                stems = [porter_stemmer.stem(token.lower().decode('utf-8', 'replace').encode('ascii', 'replace'))
                         for token in corpus]
            except (AttributeError, TypeError):  # str in Python 3 has no decode(), so fall back to plain lowercasing
                stems = [porter_stemmer.stem(token.lower())
                         for token in corpus]
            with open(stems_filename, 'wt') as f:
                for stem in stems:
                    f.write('{0}{1}'.format(stem, self.linesep))
        return stems 
Example 3
Project: qa-scrapers   Author: collab-uniba   File: discretization.py    MIT License
def build_stems(self, corpus):
        stems_filename = '{0}_stems.txt'.format(self.db_name)
        if os.path.isfile(stems_filename):  # load stems from file
            stems = list()
            self.log('Loading existing stems from {0}'.format(stems_filename), logging.INFO)
            with open(stems_filename, 'rt') as f:
                for stem in f:
                    stems.append(stem.strip())
        else:
            self.log('Creating stems from corpus into {0}. Please wait, this may take some time.'.format(
                stems_filename), logging.INFO)
            porter_stemmer = PorterStemmer()
            try:
                stems = [porter_stemmer.stem(token.lower().decode('utf-8', 'replace').encode('ascii', 'replace'))
                         for token in corpus]
            except (AttributeError, TypeError):  # str in Python 3 has no decode(), so fall back to plain lowercasing
                stems = [porter_stemmer.stem(token.lower())
                         for token in corpus]
            with open(stems_filename, 'wt') as f:
                for stem in stems:
                    f.write('{0}{1}'.format(stem, self.linesep))
        return stems 
Example 4
Project: NINJAS   Author: Beerstorm   File: Index.py    GNU General Public License v2.0
def __init__(self, corpus=None):
        """ Crée un Index à partir d'un Corpus

        doclist -- la liste de Documents à indexer
        """
        self._table = dict()
        self._stemm_table = dict()
        self._ignored_signs = re.compile("[.,;:!?()\"'`_{}]")
        self._blank_signs   = re.compile("[-\n\r]")
        # Load the stopwords
        self._stopwords = stopwords.words('english')
        # Instantiate the stemmer
        self._stemmer = PorterStemmer()

        if corpus:
            self.load_corpus(corpus)

    #
    # Private Functions
    # 
Example 5
Project: lda_2003   Author: xiaohan2012   File: util.py    MIT License
def load_line_corpus(path):
    docs = []
    
    stopwords = load_items_by_line(CURDIR + '/data/lemur-stopwords.txt')

    stemmer = nltk.PorterStemmer()
    
    with codecs.open(path, "r", "utf8") as f:
        for l in f:
            sents = nltk.sent_tokenize(l.strip().lower())
            tokenized_sents = map(nltk.word_tokenize, sents)
            doc = [stemmer.stem(word.lower())
                   for sent in tokenized_sents
                   for word in sent if word not in stopwords and len(word) > 2]
            docs.append(doc)

    return docs 
Example 6
Project: nlp-akash   Author: akashp1712   File: TF_IDF_Summarization.py    MIT License
def _create_frequency_table(text_string) -> dict:
    """
    we create a dictionary for the word frequency table.
    For this, we should only use the words that are not part of the stopWords array.

    Removing stop words and making frequency table
    Stemmer - an algorithm to bring words to its root word.
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        # Note: words are stemmed before the stopword check, so stems such
        # as "wa" (from "was") can slip past the unstemmed stopword set.
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable 
Example 7
Project: nlp-akash   Author: akashp1712   File: TF_IDF_Summarization.py    MIT License
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table  # keyed by the sentence's first 15 characters

    return frequency_matrix 
Example 8
Project: tensorflow_generate_headlines   Author: FanWan   File: textrank_word2vec.py    GNU General Public License v3.0
def cut_word_stand(sentence):
    """
    词语全部转为小写、分词、去掉标点符号和去掉停用词
    :param sentence:
    :return:list(word)
    """
    stopwords = [word.strip('\n') for word in open('stopwords.txt').readlines()]
    cut_words = nltk.tokenize.WordPunctTokenizer().tokenize(sentence.lower())
    stemmer = nltk.PorterStemmer()  # instantiated but unused: the stemming step below is commented out
    # words = [stemmer.stem(word) for word in cut_words if word.isalnum() and word not in stopwords]
    words = [word for word in cut_words if word.isalnum() and word not in stopwords]
    return words 
Example 9
Project: eKoNLPy   Author: entelecheia   File: utils.py    GNU General Public License v3.0
def __init__(self):
        self._stemmer = nltk.PorterStemmer()
        self._stopset = self.get_stopset() 
Example 10
Project: corpus-to-graph-ml   Author: CatalystCode   File: data_preparation_tools.py    MIT License
def stem_text(sent, context=None):
    processed_tokens = []
    tokens = nltk.word_tokenize(sent)
    porter = nltk.PorterStemmer()
    for t in tokens:
        t = porter.stem(t)
        processed_tokens.append(t)

    return " ".join(processed_tokens)

# Split to train and test sample sets: 
Example 11
Project: category-based-classification   Author: MKLab-ITI   File: OobFusion_2D.py    Apache License 2.0
def mytokenizer(x):
    # Tokenize sentence and return stemmed words
    # (stem_word was removed in NLTK 3; stem() is the current API)
    stemmed_list = list()
    stemmer = PorterStemmer()
    for y in x.split():
        y_s = stemmer.stem(y)
        if len(y_s) > 2:
            stemmed_list.append(y_s)
    return stemmed_list
Example 12
Project: category-based-classification   Author: MKLab-ITI   File: OobFusion_2D.py    Apache License 2.0
def readstopwords(file):
    # Read list of stopwords from file (one stopword per line)
    # (stem_word was removed in NLTK 3; stem() is the current API)
    stopwords = list()
    stemmer = PorterStemmer()
    with open(file, "r") as fin:  # the with-statement closes the file handle
        for line in fin:
            stopwords.append(stemmer.stem(line.strip()))
    return stopwords
Example 13
Project: predictive-models-in-prod   Author: fwhigh   File: feature_engineering.py    MIT License
def Tokenizer(str_input):
    import nltk
    import re  # needed for re.sub below
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    porter_stemmer = nltk.PorterStemmer()
    words = [porter_stemmer.stem(word) for word in words]
    return words
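A tokenizer with this shape is typically passed to a scikit-learn vectorizer; the following usage sketch is an assumption based on the function's name and signature, not code from the original project:

from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical usage: plug the stemming tokenizer into a vectorizer.
vectorizer = CountVectorizer(tokenizer=Tokenizer)
X = vectorizer.fit_transform(["The cats are running", "A cat ran"])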
Example 14
Project: gender-bias   Author: gender-bias   File: document.py    MIT License
def stemmed_words(self) -> List:
        """
        Compute the stems of words.

        Uses nltk.PorterStemmer.

        Returns:
            List

        """
        words = self.words()
        porter = nltk.PorterStemmer()
        return [porter.stem(w) for w in words] 
Example 15
Project: freesound-datasets   Author: MTG   File: utils.py    GNU Affero General Public License v3.0
def stem(word):
    ps = PorterStemmer()
    return ps.stem(word) 
Example 16
Project: qa-scrapers   Author: collab-uniba   File: discretizer.py    MIT License
def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix):
        LL = 0
        if answer_text != '':
            tokens = word_tokenize(str(answer_text), language='english')
            porter_stemmer = PorterStemmer()
            unique_wordcount = len(stemmed_vocabulary)
            """
            per ogni w unica print_function words
                Cw = conta w in answer_text
                PwM = self.distrib_matrix[stemmer(w)]
                unique_wordcount = len(tokenize(answer_text)
            """
            for w in tokens:
                _w = w.strip().lower()
                Cw = 0
                for _ in answer_text.split():
                    if _w == _.strip().lower():
                        Cw += 1

                try:
                    w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace'))
                except AttributeError:  # str in Python 3 has no decode()
                    w_stem = porter_stemmer.stem(_w)
                try:
                    PwM = distrib_matrix[w_stem]
                except KeyError:  # key error means frequency is equal to cutoff point 1
                    PwM = 1
                LL += (Cw * log(float(PwM)))

            try:
                LL = "{0:.2f}".format(LL / float(unique_wordcount))
            except ZeroDivisionError:
                LL = 0 

        return LL 
Example 17
Project: qa-scrapers   Author: collab-uniba   File: discretization.py    MIT License
def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix):
        LL = 0
        if answer_text != '':
            tokens = word_tokenize(str(answer_text), language='english')
            porter_stemmer = PorterStemmer()
            unique_wordcount = len(stemmed_vocabulary)
            """
            per ogni w unica print_function words
                Cw = conta w in answer_text
                PwM = self.distrib_matrix[stemmer(w)]
                unique_wordcount = len(tokenize(answer_text)
            """
            for w in tokens:
                _w = w.strip().lower()
                Cw = 0
                for _ in answer_text.split():
                    if _w == _.strip().lower():
                        Cw += 1

                try:
                    w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace'))
                except AttributeError:  # str in Python 3 has no decode()
                    w_stem = porter_stemmer.stem(_w)
                try:
                    PwM = distrib_matrix[w_stem]
                except KeyError:  # key error means frequency is equal to cutoff point 1
                    PwM = 1
                LL += (Cw * log(float(PwM)))

            try:
                LL = "{0:.2f}".format(LL / float(unique_wordcount))
            except ZeroDivisionError:
                LL = 0 

        return LL 
Example 18
Project: Weiss   Author: WangWenjun559   File: reranker.py    Apache License 2.0
def stemming(sentences):
    porter = nltk.PorterStemmer()
    stemmed_sentences = []
    for sentence in sentences:
        tokens = nltk.word_tokenize(str(sentence.sentence))
        stemmed_tokens = [porter.stem(t) for t in tokens]
        stemmed_sentences.append(' '.join(stemmed_tokens))
    return stemmed_sentences 
Example 19
Project: rejection-qa   Author: becxer   File: text_utils.py    Apache License 2.0
def __init__(self, lower: bool = True, stemmer="port"):
        self.lower = lower
        self.stemmer = stemmer
        if stemmer == "port":
            self._stemmer = PorterStemmer()
            self._stem = self._stemmer.stem
        elif stemmer == "wordnet":
            self._stemmer = WordNetLemmatizer()
            self._stem = self._stemmer.lemmatize
        else:
            raise ValueError(stemmer)
        # stemming is slow, so we cache words as we go
        self.normalize_cache = {} 
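The cache is only initialized in this excerpt; the method that consults it is not shown. A minimal sketch of how such a cache is typically used (the normalize method name is hypothetical, not from the original project):

    def normalize(self, word: str) -> str:
        # Hypothetical helper: consult the cache before stemming.
        if self.lower:
            word = word.lower()
        if word not in self.normalize_cache:
            self.normalize_cache[word] = self._stem(word)
        return self.normalize_cache[word]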
Example 20
Project: propara   Author: allenai   File: eval.py    Apache License 2.0
def stem(cls, w: str):
        if not w or len(w.strip()) == 0:
            return ""
        w_lower = w.lower()
        # Remove leading articles from the phrase (e.g., the rays => rays).
        # FIXME: change this logic to accept a list of leading articles.
        if w_lower.startswith("a "):
            w_lower = w_lower[2:]
        elif w_lower.startswith("an "):
            w_lower = w_lower[3:]
        elif w_lower.startswith("the "):
            w_lower = w_lower[4:]
        elif w_lower.startswith("your "):
            w_lower = w_lower[5:]
        elif w_lower.startswith("his "):
            w_lower = w_lower[4:]
        elif w_lower.startswith("their "):
            w_lower = w_lower[6:]
        elif w_lower.startswith("my "):
            w_lower = w_lower[3:]
        elif w_lower.startswith("another "):
            w_lower = w_lower[8:]
        elif w_lower.startswith("other "):
            w_lower = w_lower[6:]
        elif w_lower.startswith("this "):
            w_lower = w_lower[5:]
        elif w_lower.startswith("that "):
            w_lower = w_lower[5:]
        # Porter stemmer: rays => ray
        return PorterStemmer().stem(w_lower).strip() 
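The FIXME above hints at replacing the elif chain with a list of leading articles; a minimal sketch of that refactoring (not the project's code):

LEADING_ARTICLES = ("a ", "an ", "the ", "your ", "his ", "their ",
                    "my ", "another ", "other ", "this ", "that ")

def strip_leading_article(phrase: str) -> str:
    # Hypothetical refactoring: drop the first matching leading article.
    for article in LEADING_ARTICLES:
        if phrase.startswith(article):
            return phrase[len(article):]
    return phrase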
Example 21
Project: NINJAS   Author: Beerstorm   File: Query.py    GNU General Public License v2.0
def __init__(self, corpus, requete=""):
        """ Crée un dictionnaire des mots apparaissant dans la requête.
          On a donc une sous-partie du dictionnaire de l'index (Donc avec la
          même structure).
          La requete est récupérée sous forme d'une chaine de caractères.

          On créé également un ensemble de solutions pour la requête qui est
          consultable via : unknown_function()
        """
        self._table = dict()
        self._not_table = set()
        self._req = ""
        # Hash table keyed by the name of the solution document, with the
        # score assigned to that solution as the value.
        self._solutions = dict()

        self._index_size = len(corpus.docs)
        # Remove punctuation symbols from the query
        self._blank_signs   = re.compile("[-]")
        # Load the stopwords
        self._stopwords = stopwords.words('english')
        # Instantiate the stemmer
        self._stemmer = PorterStemmer()
        if requete:
            self._load(index, requete)  # NB: "index" is not defined in this scope


    #
    # Private Functions
    # 
Example 22
Project: topic-modelling-tools   Author: alan-turing-institute   File: bow.py    MIT License
def pos_count(self, items):

        if items == "tokens":
                return self.dict_count(bow_data.pos_dict)

        elif items == "stems":

            stemmer = PorterStemmer()
            pos_stems = [stemmer.stem(t) for t in
                         bow_data.pos_dict.intersection(set(self.token_key.keys()))]
            return self.dict_count(pos_stems)

        else:
            raise ValueError("Items must be either \'tokens\' or \'stems\'.") 
Example 23
Project: topic-modelling-tools   Author: alan-turing-institute   File: bow.py    MIT License
def neg_count(self, items):

        if items == "tokens":
            return self.dict_count(bow_data.neg_dict)

        elif items == "stems":

            stemmer = PorterStemmer()
            neg_stems = [stemmer.stem(t) for t in
                         bow_data.neg_dict.intersection(set(self.token_key.keys()))]
            return self.dict_count(neg_stems)

        else:
            raise ValueError("Items must be either \'tokens\' or \'stems\'.") 
Example 24
Project: topic-modelling-tools   Author: alan-turing-institute   File: bow.py    MIT License
def uncertain_count(self, items):

        if items == "tokens":
            return self.dict_count(bow_data.uncertain_dict)

        elif items == "stems":

            stemmer = PorterStemmer()
            uncertain_stems = [stemmer.stem(t) for t in
                               bow_data.uncertain_dict.intersection(
                                   set(self.token_key.keys()))]
            return self.dict_count(uncertain_stems)

        else:
            raise ValueError("Items must be either \'tokens\' or \'stems\'.") 
Example 25
Project: topic-modelling-tools   Author: alan-turing-institute   File: preprocess.py    MIT License
def stem(self):

        """
        Stem tokens with Porter Stemmer.
        """

        def s(tokens):
            stemmer = PorterStemmer()  # one stemmer per call, not per token
            return [stemmer.stem(t) for t in tokens]
        self.stems = list(map(s, self.tokens))
Example 26
Project: Quora   Author: KevinLiao159   File: nlp.py    MIT License
def stemming(tokens):
    """
    stem tokens
    """
    porter = nltk.PorterStemmer()
    return [porter.stem(t) for t in tokens] 
Example 27
Project: Sarcasm-Detection   Author: MirunaPislar   File: extract_statistical_features.py    MIT License
def get_ngrams(tokens, n, use_just_words=False, stem=False, for_semantics=False):
    if len(n) < 1:
        return {}
    if not for_semantics:
        if stem:
            porter = PorterStemmer()
            tokens = [porter.stem(t.lower()) for t in tokens]
        if use_just_words:
            tokens = [t.lower() for t in tokens if not t.startswith('@') and not t.startswith('#')
                      and t not in string.punctuation]
    ngram_tokens = []
    for i in n:
        for gram in ngrams(tokens, i):
            string_token = 'gram '
            for j in range(i):
                string_token += gram[j] + ' '
            ngram_tokens.append(string_token)
    ngram_features = {i: ngram_tokens.count(i) for i in set(ngram_tokens)}
    return ngram_features


# Get sentiment features -- a total of 18 features derived.
# Emoji features: counts of the positive, negative and neutral emojis,
# along with the ratios of positive to negative and negative to neutral emojis.
# Using the MPQA subjectivity lexicon, words are checked for their part of speech
# to obtain counts of positive, negative and neutral words, counts of the strong
# and weak subjectives along with their ratios, and a total of sentiment words.
# The VADER sentiment analyser is also used to score the sentiment held in a tweet (4 features) 
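For the VADER part of those features, NLTK ships a SentimentIntensityAnalyzer whose polarity_scores() returns four values (neg, neu, pos and compound); a minimal sketch, assuming the vader_lexicon resource has been downloaded:

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()
# polarity_scores returns a dict with four scores: neg, neu, pos, compound.
print(sia.polarity_scores("I love this, but the ending was terrible!"))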
Example 28
Project: tagbot   Author: emre   File: utils.py    MIT License
def tokenize(text):
    porter = PorterStemmer()
    words = wordpunct_tokenize(text)
    stemmed = [porter.stem(word) for word in words]
    return [
        word.lower()
        for word in stemmed
        if word.isalpha()
        and len(word) > 2
    ] 
Example 29
Project: sentiment_analysis   Author: samzek   File: Preprocessing.py    Apache License 2.0
def Preprocess(tweet):

    #tokenize
    tokens = nltk.word_tokenize(tweet)
    wnl = nltk.WordNetLemmatizer()

    #punctuation removal (don't mutate the list while iterating over it)
    tokens = [t for t in tokens if t not in {u'.', u',', u';', u':', u'!', u'?'}]

    #tagging
    tokens = nltk.pos_tag(tokens)

    #stopwords removal
    stop_set = set(stopwords.words('english'))
    tokens_no_stop = []

    for t, part in tokens:
        if t not in stop_set:
            tokens_no_stop.append((wnl.lemmatize(t), part))

    tokens_stemmed = []

    porter = nltk.PorterStemmer()
    for t, part in tokens_no_stop:
        tokens_stemmed.append((porter.stem(t), part))

    return tokens_no_stop, tokens_stemmed 
Example 30
Project: Enrich   Author: Somsubhra   File: df_classifier.py    MIT License
def is_difficult(self, word):
        if word.isdigit():
            return False

        sanitized_word = ''.join(e for e in word if e.isalnum()).lower()
        stemmed_word = PorterStemmer().stem(sanitized_word)  # stem_word was removed in NLTK 3

        if stemmed_word in self.document_frequencies:
            return self.document_frequencies[stemmed_word] < self.avg_df * 1
        else:
            return True 
Example 31
Project: Enrich   Author: Somsubhra   File: kff_classifier.py    MIT License
def is_difficult(self, word):
        if word.isdigit():
            return False

        sanitized_word = ''.join(e for e in word if e.isalnum()).lower()
        stemmed_word = PorterStemmer().stem(sanitized_word)  # stem_word was removed in NLTK 3

        if stemmed_word in self.kf_frequencies:
            return self.kf_frequencies[stemmed_word] == 0
        else:
            return True 
Example 32
Project: Enrich   Author: Somsubhra   File: syllables_classifier.py    MIT License
def is_difficult(self, word):
        if word.isdigit():
            return False

        sanitized_word = ''.join(e for e in word if e.isalnum()).lower()
        stemmed_word = PorterStemmer().stem(sanitized_word)  # stem_word was removed in NLTK 3

        if stemmed_word in self.syllables:
            return self.syllables[stemmed_word] > 2
        else:
            return True 
Example 33
Project: Enrich   Author: Somsubhra   File: itfidf_classifier.py    MIT License
def is_difficult(self, word):
        if word.isdigit():
            return False

        sanitized_word = ''.join(e for e in word if e.isalnum()).lower()
        stemmed_word = PorterStemmer().stem(sanitized_word)  # stem_word was removed in NLTK 3

        if stemmed_word in self.itfidf_dictionary:
            return self.itfidf_dictionary[stemmed_word] > self.avg_itfidf / 16
        else:
            return True 
Example 34
Project: Enrich   Author: Somsubhra   File: tf_classifier.py    MIT License
def is_difficult(self, word):
        if word.isdigit():
            return False

        sanitized_word = ''.join(e for e in word if e.isalnum()).lower()
        stemmed_word = PorterStemmer().stem(sanitized_word)  # stem_word was removed in NLTK 3

        if stemmed_word in self.term_frequencies:
            return self.term_frequencies[stemmed_word] < self.avg_tf / 8
        else:
            return True 
Example 35
Project: Enrich   Author: Somsubhra   File: psycholinguistic_db_creator.py    MIT License
def create(self):
        Logger.log_message('Creating psycholinguistic dictionary database')

        input_file = open(self.in_file, 'r')
        output_file = open(self.out_file, 'w')

        for line in input_file.readlines():
            items = line.split()
            word = PorterStemmer().stem(items[2].lower())  # stem_word was removed in NLTK 3
            kff = items[1]
            syl = items[0]

            if word in self.kf_frequencies:
                # Select the stemmed word with the maximum KF Frequency
                # (kff is still a string, so this comparison is lexicographic)
                if kff > self.kf_frequencies[word]:
                    self.kf_frequencies[word] = kff
            else:
                self.kf_frequencies[word] = kff

            if word in self.syllables:
                # Select the stemmed word with minimum number of syllables
                # (syl is still a string, so this comparison is lexicographic)
                if syl < self.syllables[word]:
                    self.syllables[word] = syl
            else:
                self.syllables[word] = syl

        # Dump the contents to the output file
        for word in self.kf_frequencies:
            output_file.write(word + ";" + self.kf_frequencies[word] + ";" + self.syllables[word] + "\n")

        input_file.close()
        output_file.close()

        Logger.log_success('Created psycholinguistic dictionary database') 
Example 36
Project: NLTK-JSON-NLP   Author: dcavar   File: __init__.py    Apache License 2.0
def get_stemmer():
    # https://textminingonline.com/dive-into-nltk-part-iv-stemming-and-lemmatization
    return PorterStemmer().stem 
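Because get_stemmer() returns the bound stem method itself, the result can be called directly; a small usage sketch:

stem = get_stemmer()
print(stem("running"))  # run
print(stem("flies"))    # fli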
Example 37
Project: incubator-mxnet-ci   Author: apache   File: SentenceParser.py    Apache License 2.0
def __init__(self):
        """
        SentenceParser serves to clean text content
        """
        self.data = None
        # extract words stem
        self.porter = nltk.PorterStemmer()
        # a set of stopwords
        self.stops = set(self.stopwords) 
Example 38
Project: Wiki_Semantic_Intention   Author: diyiy   File: wiki_edit_util.py    MIT License
def stem_words(segments):
	tokens = []
	for s in segments:
		ss = s.lower().split()
		tokens.extend(ss)
	ans = set()
	stemmer = PorterStemmer()  # stem_word was removed in NLTK 3; use stem()
	for t in tokens:
		stem_t = stemmer.stem(t)
		ans.add(stem_t)
	return ans 
Example 39
Project: vulnerability-prediction   Author: aqd14   File: text_cleanup.py    Apache License 2.0
def stemWords(word):
    return PorterStemmer().stem(word) 
Example 40
Project: foodkg.github.io   Author: foodkg   File: parse.py    Apache License 2.0
def read_name(ingredient, high_quality):

    if high_quality:
        parts = ingredient.split(",")

        kept = []

        for part in parts:
            count = 0
            tagged = nltk.pos_tag(nltk.word_tokenize(part))
            for tag in tagged:
                if "NN" in tag[1]:
                    count += 1
            if count > 0:
                kept.append(part)

        if len(kept) == 0:
            return ""

        ingredient = kept[0]

        tagged = nltk.pos_tag(nltk.word_tokenize(ingredient))

        # remove anything after a conjunction

        for x in range(len(tagged)):
            if tagged[x][1] == "CC":
                tagged = tagged[0:x]
                break

        tagged = list(
            filter(
                lambda x: (("RB" not in x[1] and x[1] != "JJ" and x[1][0] != "V"))
                or x[0].lower() in webcolors.CSS3_NAMES_TO_HEX,
                tagged,
            )
        )

        words = list(map(lambda x: x[0], tagged))

        p = nltk.PorterStemmer()  # instantiated but unused: only the lemmatizer is applied below
        w = nltk.WordNetLemmatizer()

        words = map(lambda x: w.lemmatize(x), words)
        return " ".join(words)
    else:
        return ingredient.split(",")[0] 
Example 41
Project: summary-reward-no-reference   Author: yg211   File: step1_encode_doc_summ.py    Apache License 2.0
def encode_doc_summ(stem=False, remove_stop=False):
    bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    bert_model = BertModel.from_pretrained('bert-large-uncased')

    sorted_scores = read_sorted_scores()
    input_articles, _ = read_articles()

    stopwords_list = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    vec_dic = {}

    for i, (article_id, scores_list) in tqdm(enumerate(sorted_scores.items())):
        vec_dic[article_id] = {}
        article = [entry['article'] for entry in input_articles if entry['id']==article_id][0]
        ref_summ = scores_list[0]['ref']

        if stem and remove_stop:
            sys_summs = [" ".join(sent2stokens_wostop(s['sys_summ'], stemmer, stopwords_list, 'english', True)) for s in scores_list]
            ref_summ = " ".join(sent2stokens_wostop(ref_summ, stemmer, stopwords_list, 'english', True))
            article = " ".join(sent2stokens_wostop(article, stemmer, stopwords_list, 'english', True))
        elif not stem and remove_stop:
            sys_summs = [" ".join(sent2tokens_wostop(s['sys_summ'], stopwords_list, 'english', True)) for s in scores_list]
            ref_summ = " ".join(sent2tokens_wostop(ref_summ, stopwords_list, 'english', True))
            article = " ".join(sent2tokens_wostop(article, stopwords_list, 'english', True))
        elif not remove_stop and stem:
            sys_summs = [" ".join(sent2stokens(s['sys_summ'], stemmer, 'english', True)) for s in
                         scores_list]
            ref_summ = " ".join(sent2stokens(ref_summ, stemmer, 'english', True))
            article = " ".join(sent2stokens(article, stemmer, 'english', True))
        else:
            sys_summs = [s['sys_summ'] for s in scores_list]

        summ_ids = [s['summ_id'] for s in scores_list]

        # clean text
        sys_summs = [text_normalization(s) for s in sys_summs]
        ref_summ = text_normalization(ref_summ)
        article = text_normalization(article)

        vec_dic[article_id]['article'] = raw_bert_encoder(bert_model, bert_tokenizer, [article])
        vec_dic[article_id]['ref'] = raw_bert_encoder(bert_model, bert_tokenizer, [ref_summ])
        for i,sid in enumerate(summ_ids):
            vec_dic[article_id]['sys_summ{}'.format(sid)] = raw_bert_encoder(bert_model, bert_tokenizer, [sys_summs[i]])

    save_file_name = 'doc_summ_bert_vectors'
    if stem: save_file_name += '_stem'
    if remove_stop: save_file_name += '_removeStop'
    save_file_name += '.pkl'
    with open('data/' + save_file_name, 'wb') as ff:
        pickle.dump(vec_dic, ff)