Python nltk.sent_tokenize() Examples

The following are 30 code examples showing how to use nltk.sent_tokenize(). They are extracted from open source projects; the originating project, author, source file, and license are noted above each example.

You may also want to check out the other available functions and classes of the nltk module.
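
Before working through the examples, here is a minimal, self-contained sketch of nltk.sent_tokenize() on its own. The sample text is purely illustrative; the tokenizer relies on the pre-trained Punkt model, which must have been downloaded once.

import nltk

# Fetch the Punkt sentence tokenizer model (a no-op if it is already installed).
nltk.download('punkt')

text = "NLTK splits raw text into sentences. It copes with abbreviations such as Dr. Smith. Does it work? Yes!"
for sent in nltk.sent_tokenize(text):
    print(sent)

sent_tokenize() also accepts an optional language argument (default 'english'), which several of the examples below pass explicitly.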

Example 1
Project: dl-models-for-qa   Author: sujitpal   File: kaggle.py    License: Apache License 2.0
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore").decode("ascii")  # normalize to an ASCII str
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples 
Example 2
Project: self-attentive-parser   Author: nikitakit   File: nltk_plugin.py    License: MIT License
def parse_sents(self, sents):
        """
        Parse multiple sentences

        If "sents" is a string, it will be segmented into sentences using NLTK.
        Otherwise, each element of "sents" will be treated as a sentence.

        sents (str or Iterable[str] or Iterable[List[str]]): sentences to parse

        Returns: Iter[nltk.Tree]
        """
        if isinstance(sents, STRING_TYPES):
            if self._tokenizer_lang is None:
                raise ValueError(
                    "No tokenizer available for this language. "
                    "Please split into individual sentences and tokens "
                    "before calling the parser."
                    )
            sents = nltk.sent_tokenize(sents, self._tokenizer_lang)

        for parse_raw, tags_raw, sentence in self._batched_parsed_raw(self._nltk_process_sents(sents)):
            yield self._make_nltk_tree(sentence, tags_raw, *parse_raw) 
Example 3
Project: LSTM-CRF-models   Author: abhyudaynj   File: extract_data.py    License: MIT License
def prepareSents(wrds):
    valid_sents=[]
    text=''.join(wrd[0] for wrd in wrds)
    sent_list=[[(word,0,'None') for word in sent] for sent in sent_tokenize(text)]
    text=[word for word in wrds if word[0]!=' ']
    sent_list=[[word for word in concat_words(strip_chars(sent)) if word[0]!=' '] for sent in sent_list]
    idx=0
    s_idx=0
    while idx < len(text) and s_idx<len(sent_list):
        if not match_words(sent_list[s_idx],text[idx:idx+len(sent_list[s_idx])]):
            print("NLTK:" + str(sent_list[s_idx]))
            print('MINE:' + str(text[idx:idx+len(sent_list[s_idx])]))
        else:
            valid_sents+=[text[idx:idx+len(sent_list[s_idx])]]
        idx=idx+len(sent_list[s_idx])
        s_idx+=1
    return valid_sents 
Example 4
Project: partisan-discourse   Author: DistrictDataLabs   File: nlp.py    License: Apache License 2.0
def preprocess(html):
    """
    Returns a preprocessed document: a list of paragraphs, each of which is a
    list of sentences, each of which is a list of (token, part of speech)
    tuples.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e))) 
Example 5
Project: lexpredict-contraxsuite   Author: LexPredict   File: custom.py    License: GNU Affero General Public License v3.0
def extract_nnp_phrases(text):
    """
    NNP extractor convenience method.
    :param text:
    :return:
    """
    phrase_list = []

    for sentence in nltk.sent_tokenize(text):
        # Get POS
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        # Collect consecutive proper-noun tokens (and joining tokens) into a phrase
        phrase = []

        for t, p in pos:
            if p in ["NNP", "NNPS"] or t in [",", "&"]:
                phrase.append(t)
            else:
                if len(phrase) > 1:
                    phrase_list.append(clean_nnp_phrase(phrase))
                phrase = []

    return phrase_list 
Example 6
Project: neural_chat   Author: natashamjaques   File: reddit_utils.py    License: MIT License
def clean_thread_conversations(sub_str):
    conversations = []
    for mon in ['07', '08', '09', '10', '11', '12']:
        with open('datasets/raw_reddit/reddit_{}_{}_18threads.json'.format(sub_str, mon)) as f:
            data = json.load(f)

        for thread in data:
            new_convo = {}
            new_convo['lines'] = []
            speaker = 0
            for msg in thread:
                text = clean_post(msg['text'])
                if len(text) > 1:
                    sentences = nltk.sent_tokenize(text)
                    for sent in sentences:
                        sent_dict = {}
                        sent_dict['character'] = speaker
                        sent_dict['text'] = sent
                        new_convo['lines'].append(sent_dict)
                    speaker = 1 - speaker
            if len(new_convo['lines']) > 1:
                conversations.append(new_convo)
    return conversations 
Example 7
Project: Valx   Author: Tony-Hao   File: sentence.py    License: GNU General Public License v3.0
def sentence_splitting (texts, slen = 1):
	if len(texts) <= 0:
		return []
	
	# splitting
	sentences = []
	text_sents = sent_tokenize(texts)
	if (text_sents != [''] and len(text_sents) >  0):
		for sent in text_sents:
			sent = sent.strip().split('\r') # split strings that contains "\r"
			for sen in sent:
				se = sen.split('. ')
				for s in se: 
					if (NLP_word.words_counting(s) >= slen):
						sentences.append(s)

	return sentences


# splitting text into Sentences using NLTK tokenization 
Example 8
Project: coling2018_fake-news-challenge   Author: UKPLab   File: readability_indices.py    License: Apache License 2.0
def flesch_kincaid_reading_ease(text, token_count):
    """
    Takes a text and returns its FK Reading Ease
    :param text: A string text
    :param token_count: the number of tokens in the text
    :return: FK Reading Ease
    """

    # Partly extracted from textstat 0.3.1, which is only available for Python 2 (https://github.com/shivam5992/textstat)

    def avg_syllables_per_word(text, token_count):
        syllable = syllable_count(text)
        if token_count > 0:
            return float(syllable) / float(token_count)
        else:
            return 0

    if len(nltk.sent_tokenize(text)) <= 0 or token_count <= 0:
        return 0

    ASL = float(token_count / len(nltk.sent_tokenize(text)))  # avg sentence length
    ASW = avg_syllables_per_word(text, token_count)
    FKRA = 206.835 - float(1.015 * ASL) - float(84.6 * ASW)
    return FKRA 
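
As a quick sanity check on the FKRA formula in Example 8, here is a small worked computation with assumed, purely illustrative numbers (100 tokens split into 5 sentences, 1.5 syllables per word); none of these values come from the projects above.

# Illustrative inputs: 100 tokens in 5 sentences, 1.5 syllables per word (assumed).
ASL = 100 / 5                                  # average sentence length = 20.0
ASW = 1.5                                      # average syllables per word
FKRA = 206.835 - 1.015 * ASL - 84.6 * ASW
print(FKRA)                                    # about 59.635, a mid-range score on the Flesch scale
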
Example 9
Project: atap   Author: foxbook   File: gender.py    License: Apache License 2.0
def parse_gender(text):

    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        ) 
Example 10
Project: tika-similarity   Author: chrismattmann   File: psykey.py    License: Apache License 2.0
def __init__(self, text, wordlistfolder):
        self.text = text
        self.tokens = nltk.word_tokenize(text)
        self.sentenses = nltk.sent_tokenize(text)
        self.tags = nltk.pos_tag(self.tokens)

        self.featspace = []

        self.psykfeatspace(self.featspace, wordlistfolder)
        self.bigrams(self.featspace)
        self.number_count(self.featspace)
        self.punc_count(self.featspace)
        self.big_word_count(self.featspace)
        self.words_per_sentence(self.featspace)
        self.sentence_count(self.featspace)
        self.countPOS(self.featspace, 'CC')
        self.countPOS(self.featspace, 'NP')
        self.countPOS(self.featspace, 'NNP')
        self.words(self.featspace)
        self.stem(self.featspace)

    # Counts a specific POS tag
Example 11
Project: pliers   Author: tyarkoni   File: text.py    License: BSD 3-Clause "New" or "Revised" License
def _from_text(self, text, unit, tokenizer, language):

        if tokenizer is not None:
            if isinstance(tokenizer, str):
                tokens = re.findall(tokenizer, text)
            else:
                tokens = tokenizer.tokenize(text)
        else:
            import nltk

            @requires_nltk_corpus
            def tokenize_text(text):
                if unit == 'word':
                    return nltk.word_tokenize(text, language)
                elif unit.startswith('sent'):
                    return nltk.sent_tokenize(text, language)
                else:
                    raise ValueError(
                        "unit must be either 'word' or 'sentence'")

            tokens = tokenize_text(text)

        for i, t in enumerate(tokens):
            self._elements.append(TextStim(text=t, onset=None, duration=None,
                                  order=i)) 
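
Example 11 forwards a language argument to nltk.sent_tokenize(), whose signature is nltk.sent_tokenize(text, language='english'). The language name selects the matching pre-trained Punkt model, so other languages can be segmented the same way. A minimal sketch, assuming the German Punkt data is included in the punkt download; the sample sentence is illustrative:

import nltk

german_text = "Das ist der erste Satz. Hier kommt der zweite."
print(nltk.sent_tokenize(german_text, language='german'))
# ['Das ist der erste Satz.', 'Hier kommt der zweite.']
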
Example 12
Project: ace2005-preprocessing   Author: nlpcl-lab   File: parser.py    License: MIT License
def parse_sgm(self, sgm_path):
        with open(sgm_path, 'r') as f:
            soup = BeautifulSoup(f.read(), features='html.parser')
            self.sgm_text = soup.text

            doc_type = soup.doc.doctype.text.strip()

            def remove_tags(selector):
                tags = soup.findAll(selector)
                for tag in tags:
                    tag.extract()

            if doc_type == 'WEB TEXT':
                remove_tags('poster')
                remove_tags('postdate')
                remove_tags('subject')
            elif doc_type in ['CONVERSATION', 'STORY']:
                remove_tags('speaker')

            sents = []
            converted_text = soup.text

            for sent in nltk.sent_tokenize(converted_text):
                sents.extend(sent.split('\n\n'))
            sents = list(filter(lambda x: len(x) > 5, sents))
            sents = sents[1:]
            sents_with_pos = []
            last_pos = 0
            for sent in sents:
                pos = self.sgm_text.find(sent, last_pos)
                last_pos = pos
                sents_with_pos.append({
                    'text': sent,
                    'position': [pos, pos + len(sent)]
                })

            return sents_with_pos 
Example 13
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-6.1-nlp-pipeline.py    License: MIT License
def tokenize_sentences(targets):
    while True:
        text = (yield)  # (yield) gets an item from an upstream step
        sentences = nltk.sent_tokenize(text)
        for sentence in sentences:
            for target in targets:
                target.send(sentence)  # send() sends data downstream 
Example 14
Project: qb   Author: Pinafore   File: cached_wikipedia.py    License: MIT License
def extract_wiki_sentences(title, text, n_sentences, replace_title_mentions=''):
    """
    Extracts the first n_sentences from the text of a Wikipedia page corresponding to the title.
    replace_title_mentions controls handling of references to the title in the text.
    Oftentimes QA models learn *not* to answer entities mentioned in the question, so this helps deal with that
    in the domain adaptation case.

    :param title: title of page
    :param text: text of page
    :param n_sentences: number of sentences to keep
    :param replace_title_mentions: Replace mentions with the provided string token, by default removing them
    :return:
    """
    # Get simplest representation of title and text
    title = unidecode(title).replace('_', ' ')
    text = unidecode(text)

    # Split on non-alphanumeric
    title_words = re.split('[^a-zA-Z0-9]', title)
    title_word_pattern = '|'.join(re.escape(w.lower()) for w in title_words)

    # Breaking by newline yields paragraphs. Ignore the first since it's always just the title
    paragraphs = [p for p in text.split('\n') if len(p) != 0][1:]
    sentences = []
    for p in paragraphs:
        formatted_text = re.sub(title_word_pattern, replace_title_mentions, p, flags=re.IGNORECASE)
        # Cleanup whitespace
        formatted_text = re.sub('\s+', ' ', formatted_text).strip()

        sentences.extend(nltk.sent_tokenize(formatted_text))

    return sentences[:n_sentences] 
Example 15
Project: EliIE   Author: Tian312   File: word2vec.py    License: MIT License
def tokenize_train(train_directory, tokenized_directory):
    with codecs.open(train_directory, "r", "utf-8") as file:
        with codecs.open(tokenized_directory, "w", "utf-8") as writer:
            new_sens = []
            for line in file:
                sentences = sent_tokenize(line.strip())
                for sen in sentences:
                    sen = word_tokenize(sen.lower())
                    new_sen = ' '.join(sen)
                    new_sens.append(new_sen)
                    writer.write(new_sen)
                    writer.write("\n")
    sentences = gensim.models.word2vec.LineSentence(tokenized_directory)
    return sentences 
Example 16
Project: EliIE   Author: Tian312   File: retrieve_texts.py    License: MIT License
def sentence_splitting (texts, slen = 1):           # Split ec into separate sentences.
    if len(texts) <= 0:
        return []

    # splitting
    sentences = []
    text_sents = nltk.sent_tokenize(texts)
    if (text_sents != [''] and len(text_sents) >  0):
        for sent in text_sents:
            sent=re.sub('e.g.','eg',sent)
            sent = sent.strip().split('\r') # split strings that contains "\r"
            for sen in sent:
                se = re.split('[.;]',sen)

                for s in se:
                    ss=s.split('-  ')
                    for final in ss:
                        #print final

                        match=re.match('^\d+\.\s*$',final)
                        if match:
                            continue
                        final=re.sub('\s+$','',final)
                        final=re.sub('\d+\.','',final)
                        final=final.encode('utf-8').decode('utf-8','ignore').encode("utf-8")
                        words=final.decode('ascii', 'ignore').split(' ')
                        new_words=[]
                        for w in words:
                            if w:
                                #print "=="+w+"=="
                                match=re.search('(\(*\w+\)*,*.*)',w)
                                if match:
                                    #print match.group(1)
                                    new_words.append(match.group(1))
                        new_sent=' '.join(new_words)
                        if new_sent:
                            sentences.append(new_sent)
                            #print new_sent


    return sentences 
Example 17
Project: chowmein   Author: xiaohan2012   File: data.py    License: MIT License
def load_line_corpus(path, tokenize=True):
    docs = []
    with codecs.open(path, "r", "utf8") as f:
        for l in f:
            if tokenize:
                sents = nltk.sent_tokenize(l.strip().lower())
                docs.append(list(itertools.chain(*map(
                    nltk.word_tokenize, sents))))
            else:
                docs.append(l.strip())
    return docs 
Example 18
Project: gender-bias   Author: gender-bias   File: document.py    License: MIT License
def sentences(self) -> List[str]:
        """
        Compute a list of sentences.

        Uses nltk.sent_tokenize.

        Returns:
            List[str]

        """
        return [s.replace("\n", " ") for s in nltk.sent_tokenize(self._text)] 
Example 19
Project: SOQAL   Author: husseinmozannar   File: random_reader.py    License: MIT License
def get_answer_canditates(self, paragraph):
        para_sents = nltk.sent_tokenize(paragraph)
        candidates = []
        for sent in para_sents:
            para_words = sent.split()
            for i in range(0, len(para_words)):
                for j in range(1, min(15, len(para_words) - i + 1)):
                    candidate = self.concatenateString(para_words, i, j)
                    candidates.append(candidate)
        return candidates 
Example 20
Project: SOQAL   Author: husseinmozannar   File: evaluate_baselines.py    License: MIT License
def evaluate(dataset, reader):
    f1 = exact_match = total = exact_sentence = inclusion = random = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = reader.read(paragraph['context'], qa['question'])
                sents = nltk.sent_tokenize(paragraph['context'])
                indx_g = -1
                indx_p = -1
                i = 0
                for sent in sents:
                    if sent.find(ground_truths[0]) != -1:
                        indx_g = i
                    if sent.find(prediction) != -1:
                        indx_p = i
                    i += 1
                test = randint(0, i)
                if test == indx_g:
                    random += 1
                if prediction.find(ground_truths[0]) != -1 or ground_truths[0].find(prediction) != -1:
                    inclusion += 1
                if indx_g == indx_p and indx_p != -1:
                    exact_sentence += 1
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)
    inclusion = 100 * inclusion / total
    random = 100 * random / total
    exact_sentence = 100 * exact_sentence / total
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1, 'exact_sentence': exact_sentence, 'inclusion': inclusion,
            'random': random} 
Example 21
Project: SOQAL   Author: husseinmozannar   File: embedding_match.py    License: MIT License
def get_answer_canditates(self, paragraph):
        para_sents = nltk.sent_tokenize(paragraph)
        candidates = []
        for sent in para_sents:
            para_words = sent.split()
            for i in range(0, len(para_words)):
                for j in range(1, min(15, len(para_words) - i + 1)):
                    candidate = self.concatenateString(para_words, i, j)
                    candidates.append(candidate)
        return candidates 
Example 22
Project: SOQAL   Author: husseinmozannar   File: tfidf_reader.py    License: MIT License
def get_answer_canditates(self, paragraph):
        para_sents = nltk.sent_tokenize(paragraph)
        candidates = []
        for sent in para_sents:
            para_words = sent.split()
            for i in range(0, len(para_words)):
                for j in range(1, min(15, len(para_words) - i + 1)):
                    candidate = self.concatenateString(para_words, i, j)
                    candidates.append(candidate)
        return candidates 
Example 23
Project: SOQAL   Author: husseinmozannar   File: evaluate.py    License: MIT License
def evaluate(dataset, predictions):
    f1 = exact_match = total = exact_sentence = inclusion = random = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + \
                              ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                sents = nltk.sent_tokenize(paragraph['context'])
                indx_g = -1
                indx_p = -1
                i = 0
                for sent in sents:
                    if sent.find(ground_truths[0]) != -1:
                        indx_g = i
                    if sent.find(prediction) != -1:
                        indx_p = i
                    i += 1
                test = randint(0,i)
                if test == indx_g:
                    random += 1
                if prediction.find(ground_truths[0]) != -1 or ground_truths[0].find(prediction) != -1:
                    inclusion += 1
                if indx_g == indx_p and indx_p != -1:
                    exact_sentence += 1
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)
    inclusion = inclusion / total
    random = random / total
    exact_sentence = 100 * exact_sentence / total
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1, 'exact_sentence': exact_sentence} 
Example 24
Project: chimera   Author: AmitMY   File: tokens.py    License: MIT License
def tokenize_sentences(text: str):
    text = re.sub(r" no\. ent_(\d)", r" shorthand_number ent_\1", text, flags=re.IGNORECASE)
    return [s.replace("shorthand_number", "no.") for s in sent_tokenize(text)] 
Example 25
def preprocess_data(text):
    global sentences, tokenized
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    sentences =  nltk.sent_tokenize(text)
    tokenized = [tokenizer.tokenize(s) for s in sentences]

# import the data 
Example 26
Project: practicalDataAnalysisCookbook   Author: drabastomek   File: nlp_countWords.py    License: GNU General Public License v2.0
def preprocess_data(text):
    global sentences, tokenized
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    sentences =  nltk.sent_tokenize(text)
    tokenized = [tokenizer.tokenize(s) for s in sentences]

# import the data 
Example 27
Project: practicalDataAnalysisCookbook   Author: drabastomek   File: nlp_pos.py    License: GNU General Public License v2.0
def preprocess_data(text):
    global sentences, tokenized
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    sentences =  nltk.sent_tokenize(text)
    tokenized = [tokenizer.tokenize(s) for s in sentences]

# import the data 
Example 28
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition   Author: PacktPublishing   File: classify.py    License: MIT License
def prepare_sent_features():
    for pid, text in fetch_posts(chosen, with_index=True):
        if not text:
            meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0
        else:
            sent_lens = [len(nltk.word_tokenize(
                sent)) for sent in nltk.sent_tokenize(text)]
            meta[pid]['AvgSentLen'] = np.mean(sent_lens)
            meta[pid]['AvgWordLen'] = np.mean(
                [len(w) for w in nltk.word_tokenize(text)])

        meta[pid]['NumAllCaps'] = np.sum(
            [word.isupper() for word in nltk.word_tokenize(text)])

        meta[pid]['NumExclams'] = text.count('!') 
Example 29
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: 9.5 Skipgram_Keras.py    License: MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example 30
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]

    except:
        tokens = tokens
        
    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text