Python nltk.sent_tokenize() Examples
The following are 30 code examples showing how to use nltk.sent_tokenize(). They are extracted from open source projects; the project, author, file, and license are listed above each example.
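Before the project examples, here is a minimal usage sketch of the function itself. It is not taken from any of the projects below; the sample text is invented, and it assumes the Punkt sentence-tokenizer data has been installed (e.g. via nltk.download('punkt')).

import nltk

# Assumption: the Punkt model is available locally; if not, download it first:
# nltk.download('punkt')

text = "NLTK ships a pre-trained sentence tokenizer. It handles common abbreviations. Try it on your own text."

# Split the text into a list of sentence strings (default language is English).
sentences = nltk.sent_tokenize(text)
print(sentences)

# A language argument selects another pre-trained Punkt model, for example:
# nltk.sent_tokenize(german_text, language='german')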
Example 1
Project: dl-models-for-qa Author: sujitpal File: kaggle.py License: Apache License 2.0

def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
Example 2
Project: self-attentive-parser Author: nikitakit File: nltk_plugin.py License: MIT License

def parse_sents(self, sents):
    """
    Parse multiple sentences

    If "sents" is a string, it will be segmented into sentences using NLTK.
    Otherwise, each element of "sents" will be treated as a sentence.

    sents (str or Iterable[str] or Iterable[List[str]]): sentences to parse

    Returns: Iter[nltk.Tree]
    """
    if isinstance(sents, STRING_TYPES):
        if self._tokenizer_lang is None:
            raise ValueError(
                "No tokenizer available for this language. "
                "Please split into individual sentences and tokens "
                "before calling the parser."
            )
        sents = nltk.sent_tokenize(sents, self._tokenizer_lang)

    for parse_raw, tags_raw, sentence in self._batched_parsed_raw(self._nltk_process_sents(sents)):
        yield self._make_nltk_tree(sentence, tags_raw, *parse_raw)
Example 3
Project: LSTM-CRF-models Author: abhyudaynj File: extract_data.py License: MIT License

def prepareSents(wrds):
    valid_sents = []
    text = ''.join(wrd[0] for wrd in wrds)
    sent_list = [[(word, 0, 'None') for word in sent] for sent in sent_tokenize(text)]
    text = [word for word in wrds if word[0] != ' ']
    sent_list = [[word for word in concat_words(strip_chars(sent)) if word[0] != ' '] for sent in sent_list]
    idx = 0
    s_idx = 0
    while idx < len(text) and s_idx < len(sent_list):
        if not match_words(sent_list[s_idx], text[idx:idx + len(sent_list[s_idx])]):
            print "NLTK:" + str(sent_list[s_idx])
            print 'MINE:' + str(text[idx:idx + len(sent_list[s_idx])])
        else:
            valid_sents += [text[idx:idx + len(sent_list[s_idx])]]
        idx = idx + len(sent_list[s_idx])
        s_idx += 1
    return valid_sents
Example 4
Project: partisan-discourse Author: DistrictDataLabs File: nlp.py License: Apache License 2.0

def preprocess(html):
    """
    Returns a preprocessed document consisting of a list of paragraphs, which
    is a list of sentences, which is a list of tuples, where each tuple is a
    (token, part of speech) pair.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e)))
Example 5
Project: lexpredict-contraxsuite Author: LexPredict File: custom.py License: GNU Affero General Public License v3.0

def extract_nnp_phrases(text):
    """
    NNP extractor convenience method.
    :param text:
    :return:
    """
    phrase_list = []

    for sentence in nltk.sent_tokenize(text):
        # Get POS
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        # Get POS
        phrase = []

        for t, p in pos:
            if p in ["NNP", "NNPS"] or t in [",", "&"]:
                phrase.append(t)
            else:
                if len(phrase) > 1:
                    phrase_list.append(clean_nnp_phrase(phrase))
                phrase = []

    return phrase_list
Example 6
Project: neural_chat Author: natashamjaques File: reddit_utils.py License: MIT License

def clean_thread_conversations(sub_str):
    conversations = []
    for mon in ['07', '08', '09', '10', '11', '12']:
        with open('datasets/raw_reddit/reddit_{}_{}_18threads.json'.format(sub_str, mon)) as f:
            data = json.load(f)

        for thread in data:
            new_convo = {}
            new_convo['lines'] = []
            speaker = 0
            for msg in thread:
                text = clean_post(msg['text'])
                if len(text) > 1:
                    sentences = nltk.sent_tokenize(text)
                    for sent in sentences:
                        sent_dict = {}
                        sent_dict['character'] = speaker
                        sent_dict['text'] = sent
                        new_convo['lines'].append(sent_dict)
                    speaker = 1 - speaker
            if len(new_convo['lines']) > 1:
                conversations.append(new_convo)
    return conversations
Example 7
Project: Valx Author: Tony-Hao File: sentence.py License: GNU General Public License v3.0

def sentence_splitting (texts, slen = 1):
    if len(texts) <= 0:
        return []

    # splitting
    sentences = []
    text_sents = sent_tokenize(texts)
    if (text_sents != [''] and len(text_sents) > 0):
        for sent in text_sents:
            sent = sent.strip().split('\r')  # split strings that contains "\r"
            for sen in sent:
                se = sen.split('. ')
                for s in se:
                    if (NLP_word.words_counting(s) >= slen):
                        sentences.append(s)
    return sentences

# splitting text into Sentences using NLTK tokenization
Example 8
Project: coling2018_fake-news-challenge Author: UKPLab File: readability_indices.py License: Apache License 2.0

def flesch_kincaid_reading_ease(text, token_count):
    """
    Takes a text and returns its FK Reading Ease
    :param text: A string text
    :param token_count: the number of tokens in the text
    :return: FK Reading Ease
    """
    # Partly extracted of textstat 0.3.1 which is only available for python 2 (https://github.com/shivam5992/textstat)
    def avg_syllables_per_word(text, token_count):
        syllable = syllable_count(text)
        if token_count > 0:
            return float(syllable) / float(token_count)
        else:
            return 0

    if len(nltk.sent_tokenize(text)) <= 0 or token_count <= 0:
        return 0

    ASL = float(token_count / len(nltk.sent_tokenize(text)))  # avg sentence length
    ASW = avg_syllables_per_word(text, token_count)
    FKRA = 206.835 - float(1.015 * ASL) - float(84.6 * ASW)
    return FKRA
Example 9
Project: atap Author: foxbook File: gender.py License: Apache License 2.0

def parse_gender(text):

    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
Example 10
Project: tika-similarity Author: chrismattmann File: psykey.py License: Apache License 2.0

def __init__(self, text, wordlistfolder):
    self.text = text
    self.tokens = nltk.word_tokenize(text)
    self.sentenses = nltk.sent_tokenize(text)
    self.tags = nltk.pos_tag(self.tokens)

    self.featspace = []

    self.psykfeatspace(self.featspace, wordlistfolder)
    self.bigrams(self.featspace)
    self.number_count(self.featspace)
    self.punc_count(self.featspace)
    self.big_word_count(self.featspace)
    self.words_per_sentence(self.featspace)
    self.sentence_count(self.featspace)
    self.countPOS(self.featspace, 'CC')
    self.countPOS(self.featspace, 'NP')
    self.countPOS(self.featspace, 'NNP')
    self.words(self.featspace)
    self.stem(self.featspace)

# Counts a specific POS tags
Example 11
Project: pliers Author: tyarkoni File: text.py License: BSD 3-Clause "New" or "Revised" License

def _from_text(self, text, unit, tokenizer, language):
    if tokenizer is not None:
        if isinstance(tokenizer, str):
            tokens = re.findall(tokenizer, text)
        else:
            tokens = tokenizer.tokenize(text)
    else:
        import nltk

        @requires_nltk_corpus
        def tokenize_text(text):
            if unit == 'word':
                return nltk.word_tokenize(text, language)
            elif unit.startswith('sent'):
                return nltk.sent_tokenize(text, language)
            else:
                raise ValueError(
                    "unit must be either 'word' or 'sentence'")

        tokens = tokenize_text(text)

    for i, t in enumerate(tokens):
        self._elements.append(TextStim(text=t,
                                       onset=None,
                                       duration=None,
                                       order=i))
Example 12
Project: ace2005-preprocessing Author: nlpcl-lab File: parser.py License: MIT License

def parse_sgm(self, sgm_path):
    with open(sgm_path, 'r') as f:
        soup = BeautifulSoup(f.read(), features='html.parser')
        self.sgm_text = soup.text

        doc_type = soup.doc.doctype.text.strip()

        def remove_tags(selector):
            tags = soup.findAll(selector)
            for tag in tags:
                tag.extract()

        if doc_type == 'WEB TEXT':
            remove_tags('poster')
            remove_tags('postdate')
            remove_tags('subject')
        elif doc_type in ['CONVERSATION', 'STORY']:
            remove_tags('speaker')

        sents = []
        converted_text = soup.text

        for sent in nltk.sent_tokenize(converted_text):
            sents.extend(sent.split('\n\n'))
        sents = list(filter(lambda x: len(x) > 5, sents))
        sents = sents[1:]

        sents_with_pos = []
        last_pos = 0
        for sent in sents:
            pos = self.sgm_text.find(sent, last_pos)
            last_pos = pos
            sents_with_pos.append({
                'text': sent,
                'position': [pos, pos + len(sent)]
            })

        return sents_with_pos
Example 13
Project: Hands-on-NLP-with-NLTK-and-scikit-learn- Author: PacktPublishing File: nlp-6.1-nlp-pipeline.py License: MIT License

def tokenize_sentences(targets):
    while True:
        text = (yield)  # (yield) gets an item from an upstream step
        sentences = nltk.sent_tokenize(text)
        for sentence in sentences:
            for target in targets:
                target.send(sentence)  # send() sends data downstream
Example 14
Project: qb Author: Pinafore File: cached_wikipedia.py License: MIT License

def extract_wiki_sentences(title, text, n_sentences, replace_title_mentions=''):
    """
    Extracts the first n_paragraphs from the text of a wikipedia page corresponding to the title.
    strip_title_mentions and replace_title_mentions control handling of references to the title in text.
    Oftentimes QA models learn *not* to answer entities mentioned in the question so this helps deal with
    this in the domain adaptation case.

    :param title: title of page
    :param text: text of page
    :param n_paragraphs: number of paragraphs to use
    :param replace_title_mentions: Replace mentions with the provided string token, by default removing them
    :return:
    """
    # Get simplest representation of title and text
    title = unidecode(title).replace('_', ' ')
    text = unidecode(text)

    # Split on non-alphanumeric
    title_words = re.split('[^a-zA-Z0-9]', title)
    title_word_pattern = '|'.join(re.escape(w.lower()) for w in title_words)

    # Breaking by newline yields paragraphs. Ignore the first since its always just the title
    paragraphs = [p for p in text.split('\n') if len(p) != 0][1:]

    sentences = []
    for p in paragraphs:
        formatted_text = re.sub(title_word_pattern, replace_title_mentions, p, flags=re.IGNORECASE)
        # Cleanup whitespace
        formatted_text = re.sub('\s+', ' ', formatted_text).strip()
        sentences.extend(nltk.sent_tokenize(formatted_text))

    return sentences[:n_sentences]
Example 15
Project: EliIE Author: Tian312 File: word2vec.py License: MIT License

def tokenize_train(train_directory, tokenized_directory):
    with codecs.open(train_directory, "r", "utf-8") as file:
        with codecs.open(tokenized_directory, "w", "utf-8") as writer:
            new_sens = []
            for line in file:
                sentences = sent_tokenize(line.strip())
                for sen in sentences:
                    sen = word_tokenize(sen.lower())
                    new_sen = ' '.join(sen)
                    new_sens.append(new_sen)
                    writer.write(new_sen)
                    writer.write("\n")
    sentences = gensim.models.word2vec.LineSentence(tokenized_directory)
    return sentences
Example 16
Project: EliIE Author: Tian312 File: retrieve_texts.py License: MIT License

def sentence_splitting (texts, slen = 1):  # Split ec into seperated sentences.
    if len(texts) <= 0:
        return []

    # splitting
    sentences = []
    text_sents = nltk.sent_tokenize(texts)
    if (text_sents != [''] and len(text_sents) > 0):
        for sent in text_sents:
            sent = re.sub('e.g.', 'eg', sent)
            sent = sent.strip().split('\r')  # split strings that contains "\r"
            for sen in sent:
                se = re.split('[.;]', sen)
                for s in se:
                    ss = s.split('- ')
                    for final in ss:
                        #print final
                        match = re.match('^\d+\.\s*$', final)
                        if match:
                            continue
                        final = re.sub('\s+$', '', final)
                        final = re.sub('\d+\.', '', final)
                        final = final.encode('utf-8').decode('utf-8', 'ignore').encode("utf-8")
                        words = final.decode('ascii', 'ignore').split(' ')
                        new_words = []
                        for w in words:
                            if w:
                                #print "=="+w+"=="
                                match = re.search('(\(*\w+\)*,*.*)', w)
                                if match:
                                    #print match.group(1)
                                    new_words.append(match.group(1))
                        new_sent = ' '.join(new_words)
                        if new_sent:
                            sentences.append(new_sent)
                            #print new_sent
    return sentences
Example 17
Project: chowmein Author: xiaohan2012 File: data.py License: MIT License

def load_line_corpus(path, tokenize=True):
    docs = []
    with codecs.open(path, "r", "utf8") as f:
        for l in f:
            if tokenize:
                sents = nltk.sent_tokenize(l.strip().lower())
                docs.append(list(itertools.chain(*map(
                    nltk.word_tokenize, sents))))
            else:
                docs.append(l.strip())

    return docs
Example 18
Project: gender-bias Author: gender-bias File: document.py License: MIT License

def sentences(self) -> List[str]:
    """
    Compute a list of sentences.

    Uses nltk.sent_tokenize.

    Returns:
        List[str]
    """
    return [s.replace("\n", " ") for s in nltk.sent_tokenize(self._text)]
Example 19
Project: SOQAL Author: husseinmozannar File: random_reader.py License: MIT License

def get_answer_canditates(self, paragraph):
    para_sents = nltk.sent_tokenize(paragraph)
    candidates = []
    for sent in para_sents:
        para_words = sent.split()
        for i in range(0, len(para_words)):
            for j in range(1, min(15, len(para_words) - i + 1)):
                candidate = self.concatenateString(para_words, i, j)
                candidates.append(candidate)
    return candidates
Example 20
Project: SOQAL Author: husseinmozannar File: evaluate_baselines.py License: MIT License

def evaluate(dataset, reader):
    f1 = exact_match = total = exact_sentence = inclusion = random = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = reader.read(paragraph['context'], qa['question'])
                sents = nltk.sent_tokenize(paragraph['context'])
                indx_g = -1
                indx_p = -1
                i = 0
                for sent in sents:
                    if sent.find(ground_truths[0]) != -1:
                        indx_g = i
                    if sent.find(prediction) != -1:
                        indx_p = i
                    i += 1
                test = randint(0, i)
                if test == indx_g:
                    random += 1
                if prediction.find(ground_truths[0]) != -1 or ground_truths[0].find(prediction):
                    inclusion += 1
                if indx_g == indx_p and indx_p != -1:
                    exact_sentence += 1
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    inclusion = 100 * inclusion / total
    random = 100 * random / total
    exact_sentence = 100 * exact_sentence / total
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1, 'exact_sentence': exact_sentence,
            'inclusion': inclusion, 'random': random}
Example 21
Project: SOQAL Author: husseinmozannar File: embedding_match.py License: MIT License

def get_answer_canditates(self, paragraph):
    para_sents = nltk.sent_tokenize(paragraph)
    candidates = []
    for sent in para_sents:
        para_words = sent.split()
        for i in range(0, len(para_words)):
            for j in range(1, min(15, len(para_words) - i + 1)):
                candidate = self.concatenateString(para_words, i, j)
                candidates.append(candidate)
    return candidates
Example 22
Project: SOQAL Author: husseinmozannar File: tfidf_reader.py License: MIT License

def get_answer_canditates(self, paragraph):
    para_sents = nltk.sent_tokenize(paragraph)
    candidates = []
    for sent in para_sents:
        para_words = sent.split()
        for i in range(0, len(para_words)):
            for j in range(1, min(15, len(para_words) - i + 1)):
                candidate = self.concatenateString(para_words, i, j)
                candidates.append(candidate)
    return candidates
Example 23
Project: SOQAL Author: husseinmozannar File: evaluate.py License: MIT License

def evaluate(dataset, predictions):
    f1 = exact_match = total = exact_sentence = inclusion = random = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + \
                              ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                sents = nltk.sent_tokenize(paragraph['context'])
                indx_g = -1
                indx_p = -1
                i = 0
                for sent in sents:
                    if sent.find(ground_truths[0]) != -1:
                        indx_g = i
                    if sent.find(prediction) != -1:
                        indx_p = i
                    i += 1
                test = randint(0, i)
                if test == indx_g:
                    random += 1
                if prediction.find(ground_truths[0]) != -1 or ground_truths[0].find(prediction):
                    inclusion += 1
                if indx_g == indx_p and indx_p != -1:
                    exact_sentence += 1
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    inclusion = inclusion / total
    random = random / total
    exact_sentence = 100 * exact_sentence / total
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1, 'exact_sentence': exact_sentence}
Example 24
Project: chimera Author: AmitMY File: tokens.py License: MIT License

def tokenize_sentences(text: str):
    text = re.sub(r" no\. ent_(\d)", r" shorthand_number ent_\1", text, flags=re.IGNORECASE)
    return [s.replace("shorthand_number", "no.") for s in sent_tokenize(text)]
Example 25
Project: practicalDataAnalysisCookbook Author: drabastomek File: nlp_pos_alternative.py License: GNU General Public License v2.0

def preprocess_data(text):
    global sentences, tokenized
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    sentences = nltk.sent_tokenize(text)
    tokenized = [tokenizer.tokenize(s) for s in sentences]

# import the data
Example 26
Project: practicalDataAnalysisCookbook Author: drabastomek File: nlp_countWords.py License: GNU General Public License v2.0

def preprocess_data(text):
    global sentences, tokenized
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    sentences = nltk.sent_tokenize(text)
    tokenized = [tokenizer.tokenize(s) for s in sentences]

# import the data
Example 27
Project: practicalDataAnalysisCookbook Author: drabastomek File: nlp_pos.py License: GNU General Public License v2.0

def preprocess_data(text):
    global sentences, tokenized
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    sentences = nltk.sent_tokenize(text)
    tokenized = [tokenizer.tokenize(s) for s in sentences]

# import the data
Example 28
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition Author: PacktPublishing File: classify.py License: MIT License

def prepare_sent_features():
    for pid, text in fetch_posts(chosen, with_index=True):
        if not text:
            meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0
        else:
            sent_lens = [len(nltk.word_tokenize(
                sent)) for sent in nltk.sent_tokenize(text)]
            meta[pid]['AvgSentLen'] = np.mean(sent_lens)
            meta[pid]['AvgWordLen'] = np.mean(
                [len(w) for w in nltk.word_tokenize(text)])
        meta[pid]['NumAllCaps'] = np.sum(
            [word.isupper() for word in nltk.word_tokenize(text)])
        meta[pid]['NumExclams'] = text.count('!')
Example 29
Project: Natural-Language-Processing-with-Python-Cookbook Author: PacktPublishing File: 9.5 Skipgram_Keras.py License: MIT License

def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
Example 30
Project: Natural-Language-Processing-with-Python-Cookbook Author: PacktPublishing File: 9.2 Email_Classification.py License: MIT License

def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]
    except:
        tokens = tokens

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text