Python nltk.sent_tokenize() Examples

The following are 30 code examples of nltk.sent_tokenize(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
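
As a quick orientation before the project examples, here is a minimal sketch of calling nltk.sent_tokenize() directly; it assumes the Punkt sentence model has been downloaded once.

import nltk
# nltk.download('punkt')  # one-time download of the Punkt sentence model

text = "Sentence tokenization splits text into sentences. It relies on the Punkt model. Does it work? Yes!"
print(nltk.sent_tokenize(text))
# ['Sentence tokenization splits text into sentences.', 'It relies on the Punkt model.', 'Does it work?', 'Yes!']
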
Example #1
Source File: kaggle.py    From dl-models-for-qa with Apache License 2.0
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore").decode("ascii")  # drop non-ASCII characters (Python 3 compatible)
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples 
Example #2
Source File: readability_indices.py    From coling2018_fake-news-challenge with Apache License 2.0
def flesch_kincaid_reading_ease(text, token_count):
    """
    Takes a text and returns its FK Reading Ease
    :param text: A string text
    :param token_count: the number of tokens in the text
    :return: FK Reading Ease
    """

    # Partly adapted from textstat 0.3.1, which is only available for Python 2 (https://github.com/shivam5992/textstat)

    def avg_syllables_per_word(text, token_count):
        syllable = syllable_count(text)
        if token_count > 0:
            return float(syllable) / float(token_count)
        else:
            return 0

    if len(nltk.sent_tokenize(text)) <= 0 or token_count <= 0:
        return 0

    ASL = float(token_count) / len(nltk.sent_tokenize(text))  # avg sentence length
    ASW = avg_syllables_per_word(text, token_count)
    FKRA = 206.835 - float(1.015 * ASL) - float(84.6 * ASW)
    return FKRA 
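
A hypothetical usage sketch for the function above; syllable_count() is defined elsewhere in the same module and is not shown here, so only the call pattern is illustrated.

import string
import nltk

sample = "This is a short example. It has two sentences."
tokens = [t for t in nltk.word_tokenize(sample) if t not in string.punctuation]
score = flesch_kincaid_reading_ease(sample, token_count=len(tokens))
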
Example #3
Source File: extract_data.py    From LSTM-CRF-models with MIT License
def prepareSents(wrds):
    valid_sents=[]
    text=''.join(wrd[0] for wrd in wrds)
    sent_list=[[(word,0,'None') for word in sent] for sent in sent_tokenize(text)]
    text=[word for word in wrds if word[0]!=' ']
    sent_list=[[word for word in concat_words(strip_chars(sent)) if word[0]!=' '] for sent in sent_list]
    idx=0
    s_idx=0
    while idx < len(text) and s_idx<len(sent_list):
        if not match_words(sent_list[s_idx],text[idx:idx+len(sent_list[s_idx])]):
            print "NLTK:"+ str(sent_list[s_idx])
            print 'MINE:' + str(text[idx:idx+len(sent_list[s_idx])])
        else:
            valid_sents+=[text[idx:idx+len(sent_list[s_idx])]]
        idx=idx+len(sent_list[s_idx])
        s_idx+=1
    return valid_sents 
Example #4
Source File: nlp.py    From partisan-discourse with Apache License 2.0
def preprocess(html):
    """
    Returns a preprocessed document consisting of a list of paragraphs, which
    is a list of sentences, which is a list of tuples, where each tuple is a
    (token, part of speech) pair.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e))) 
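
para_tokenize() and NLTKError come from the surrounding project and are not shown here. Below is a self-contained sketch of the same paragraph-to-sentence-to-(token, POS) nesting, using a blank-line split as a stand-in for para_tokenize().

import nltk
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')

raw = "NLTK ships a sentence tokenizer. It also ships a POS tagger.\n\nThis is a second paragraph."
paras = [p for p in raw.split("\n\n") if p.strip()]  # stand-in for para_tokenize(html)
doc = [
    [nltk.pos_tag(nltk.wordpunct_tokenize(sent)) for sent in nltk.sent_tokenize(para)]
    for para in paras
]
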
Example #5
Source File: gender.py    From atap with Apache License 2.0
def parse_gender(text):

    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]

    sents, words = count_gender(sentences)
    total = sum(words.values())

    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        ) 
Example #6
Source File: custom.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def extract_nnp_phrases(text):
    """
    NNP extractor convenience method.
    :param text:
    :return:
    """
    phrase_list = []

    for sentence in nltk.sent_tokenize(text):
        # Get POS
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        # Get POS
        phrase = []

        for t, p in pos:
            if p in ["NNP", "NNPS"] or t in [",", "&"]:
                phrase.append(t)
            else:
                if len(phrase) > 1:
                    phrase_list.append(clean_nnp_phrase(phrase))
                phrase = []

    return phrase_list 
Example #7
Source File: reddit_utils.py    From neural_chat with MIT License
def clean_thread_conversations(sub_str):
    conversations = []
    for mon in ['07', '08', '09', '10', '11', '12']:
        with open('datasets/raw_reddit/reddit_{}_{}_18threads.json'.format(sub_str, mon)) as f:
            data = json.load(f)

        for thread in data:
            new_convo = {}
            new_convo['lines'] = []
            speaker = 0
            for msg in thread:
                text = clean_post(msg['text'])
                if len(text) > 1:
                    sentences = nltk.sent_tokenize(text)
                    for sent in sentences:
                        sent_dict = {}
                        sent_dict['character'] = speaker
                        sent_dict['text'] = sent
                        new_convo['lines'].append(sent_dict)
                    speaker = 1 - speaker
            if len(new_convo['lines']) > 1:
                conversations.append(new_convo)
    return conversations 
Example #8
Source File: psykey.py    From tika-similarity with Apache License 2.0
def __init__(self, text, wordlistfolder):
        self.text = text
        self.tokens = nltk.word_tokenize(text)
        self.sentenses = nltk.sent_tokenize(text)
        self.tags = nltk.pos_tag(self.tokens)

        self.featspace = []

        self.psykfeatspace(self.featspace, wordlistfolder)
        self.bigrams(self.featspace)
        self.number_count(self.featspace)
        self.punc_count(self.featspace)
        self.big_word_count(self.featspace)
        self.words_per_sentence(self.featspace)
        self.sentence_count(self.featspace)
        self.countPOS(self.featspace, 'CC')
        self.countPOS(self.featspace, 'NP')
        self.countPOS(self.featspace, 'NNP')
        self.words(self.featspace)
        self.stem(self.featspace)

Example #9
Source File: text.py    From pliers with BSD 3-Clause "New" or "Revised" License
def _from_text(self, text, unit, tokenizer, language):

        if tokenizer is not None:
            if isinstance(tokenizer, str):
                tokens = re.findall(tokenizer, text)
            else:
                tokens = tokenizer.tokenize(text)
        else:
            import nltk

            @requires_nltk_corpus
            def tokenize_text(text):
                if unit == 'word':
                    return nltk.word_tokenize(text, language)
                elif unit.startswith('sent'):
                    return nltk.sent_tokenize(text, language)
                else:
                    raise ValueError(
                        "unit must be either 'word' or 'sentence'")

            tokens = tokenize_text(text)

        for i, t in enumerate(tokens):
            self._elements.append(TextStim(text=t, onset=None, duration=None,
                                  order=i)) 
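
Both nltk.sent_tokenize() and nltk.word_tokenize() accept a language argument, as used above; the bundled Punkt data includes pre-trained models for several European languages. A small sketch:

import nltk
# nltk.download('punkt')

german = "Das ist ein Satz. Hier ist noch einer."
print(nltk.sent_tokenize(german, language='german'))  # ['Das ist ein Satz.', 'Hier ist noch einer.']
print(nltk.word_tokenize(german, language='german'))
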
Example #10
Source File: sentence.py    From Valx with GNU General Public License v3.0
def sentence_splitting (texts, slen = 1):
	if len(texts) <= 0:
		return []
	
	# splitting
	sentences = []
	text_sents = sent_tokenize(texts)
	if (text_sents != [''] and len(text_sents) >  0):
		for sent in text_sents:
			sent = sent.strip().split('\r') # split strings that contains "\r"
			for sen in sent:
				se = sen.split('. ')
				for s in se: 
					if (NLP_word.words_counting(s) >= slen):
						sentences.append(s)

	return sentences


Example #11
Source File: nltk_plugin.py    From self-attentive-parser with MIT License
def parse_sents(self, sents):
        """
        Parse multiple sentences

        If "sents" is a string, it will be segmented into sentences using NLTK.
        Otherwise, each element of "sents" will be treated as a sentence.

        sents (str or Iterable[str] or Iterable[List[str]]): sentences to parse

        Returns: Iter[nltk.Tree]
        """
        if isinstance(sents, STRING_TYPES):
            if self._tokenizer_lang is None:
                raise ValueError(
                    "No tokenizer available for this language. "
                    "Please split into individual sentences and tokens "
                    "before calling the parser."
                    )
            sents = nltk.sent_tokenize(sents, self._tokenizer_lang)

        for parse_raw, tags_raw, sentence in self._batched_parsed_raw(self._nltk_process_sents(sents)):
            yield self._make_nltk_tree(sentence, tags_raw, *parse_raw) 
Example #12
Source File: preprocessor.py    From atap with Apache License 2.0
def tokenize(self, fileid):
        """
        Segments, tokenizes, and tags a document in the corpus. Returns a
        generator of paragraphs, which are lists of sentences, which in turn
        are lists of part of speech tagged words.
        """
        for paragraph in self.corpus.paras(fileids=fileid):
            yield [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ] 
Example #13
Source File: harvesttext.py    From HarvestText with MIT License
def cut_sentences(self, para, drop_empty_line=True, strip=True, deduplicate=False):
        '''cut_sentences

        :param para: input text
        :param drop_empty_line: whether to drop empty lines
        :param strip: whether to strip() each sentence
        :param deduplicate: whether to collapse consecutive sentence-ending punctuation, which helps split sentences that end with repeated punctuation marks
        :return: sentences: list of str
        '''
        if deduplicate:
            para = re.sub(r"([。!?\!\?])\1+", r"\1", para)

        if self.language == 'en':
            from nltk import sent_tokenize
            sents = sent_tokenize(para)
            if strip:
                sents = [x.strip() for x in sents]
            if drop_empty_line:
                sents = [x for x in sents if len(x.strip()) > 0]
            return sents
        else:
            para = re.sub('([。!?\?!])([^”’])', r"\1\n\2", para)  # single-character sentence terminators
            para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis (six dots)
            para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
            para = re.sub('([。!?\?!][”’])([^,。!?\?])', r'\1\n\2', para)
            # If a sentence terminator precedes a closing quote, the quote marks the true end of the
            # sentence, so the split marker \n goes after the quote; the rules above carefully keep the quotes attached.
            para = para.rstrip()  # drop any stray trailing \n at the end of the paragraph
            # Many rule sets also split on the semicolon ';', but it is ignored here, as are dashes,
            # English double quotes, etc.; adjust as needed.
            sentences = para.split("\n")
            if strip:
                sentences = [sent.strip() for sent in sentences]
            if drop_empty_line:
                sentences = [sent for sent in sentences if len(sent.strip()) > 0]
            return sentences 
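
The non-English branch above is pure regex and does not use NLTK. A minimal standalone sketch of just the single-character terminator rule, with hypothetical input:

import re

para = "今天天气不错!你觉得呢?我觉得很好。"
para = re.sub(r'([。!?\?!])([^”’])', r"\1\n\2", para)  # same terminator rule as above
print(para.rstrip().split("\n"))
# ['今天天气不错!', '你觉得呢?', '我觉得很好。']
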
Example #14
Source File: find_full_text_sentence.py    From indra with BSD 2-Clause "Simplified" License
def sentence_tokenize(self, text):
        #return text.split('.')
        return sent_tokenize(text) 
Example #15
Source File: tokenize.py    From pywsd with MIT License
def word_tokenize(text, language='english', preserve_line=False):
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [token for sent in sentences
            for token in _treebank_word_tokenizer.tokenize(sent)] 
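
_treebank_word_tokenizer is a module-level object in pywsd that is not shown here; binding it to NLTK's TreebankWordTokenizer below is an assumption, made only so the preserve_line switch can be exercised.

from nltk import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer

_treebank_word_tokenizer = TreebankWordTokenizer()  # assumption: stand-in for pywsd's internal tokenizer

text = "First sentence. Second one."
print(word_tokenize(text))                      # sentence-split first, then word-tokenized
print(word_tokenize(text, preserve_line=True))  # the whole string treated as a single line
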
Example #16
Source File: zoo.py    From razdel with MIT License
def nltk_sentenize(text):
    from nltk import sent_tokenize

    chunks = sent_tokenize(text, 'russian')
    return find_substrings(chunks, text) 
Example #17
Source File: analyzer.py    From homer with MIT License
def __init__(self, paragraph):
        paragraph = paragraph.replace('—', ' ')
        self.paragraph = paragraph
        self.tokenized_sentences = nltk.sent_tokenize(paragraph)
        self._sentences = [Sentence(sentence) for sentence in self.tokenized_sentences] 
Example #18
Source File: SensationalismClassifier.py    From news-audit with GNU General Public License v3.0
def transform(self, text_fields):
        stats = []
        punctuation = string.punctuation
        abvs = ['CNN', 'FBI', 'ABC', 'MSNBC', 'GOP', 'U.S.', 'US', 'ISIS', 'DNC', 'TV', 'CIA', 'I', 'AP', 'PM', 'AM', 'EU', 'USA', 'UK', 'UN', 'CEO', 'NASA', 'LGBT', 'LGBTQ', 'NAFTA', 'ACLU']
        for field in text_fields:
            field_stats = {}
            tok_text = nltk.word_tokenize(field)
            try:
                num_upper = float(len([w for w in tok_text if w.isupper() and w not in abvs]))/len(tok_text)
            except:
                num_upper = 0
            try:
                num_punct = float(len([ch for ch in field if ch in punctuation]))/len(field)
            except:
                num_punct = 0   
            try:
                sent_lengths = [len(nltk.word_tokenize(s)) for s in nltk.sent_tokenize(field)]
                av_sent_len = float(sum(sent_lengths))/len(sent_lengths)
            except:
                av_sent_len = 0
            try:
                num_prof = float(len([w for w in tok_text if w.lower() in PROFANITY]))/len(tok_text)
            except:
                num_prof = 0

            polarity, subjectivity = sentiment(field)
            field_stats['all_caps'] = num_upper
            field_stats['sent_len'] = av_sent_len
            field_stats['polarity'] = polarity
            field_stats['subjectivity'] = subjectivity
            field_stats['profanity'] = num_prof
            stats.append(field_stats)
        return stats 
Example #19
Source File: sent_parsing.py    From atap with Apache License 2.0
def sents(paragraph):
    for sentence in sent_tokenize(paragraph):
        yield sentence 
Example #20
Source File: readability_indices.py    From coling2018_fake-news-challenge with Apache License 2.0
def coleman_liau_index(text, token_count):
    """
    Takes a text and returns its Coleman Liau Index
    :param text: A string text
    :param token_count: the number of tokens in the text
    :return: Coleman Liau Index
    """

    # Partly adapted from textstat 0.3.1, which is only available for Python 2 (https://github.com/shivam5992/textstat)

    def char_count(text):
        """
        Function to return total character counts in a text
        """
        count_chars = 0
        text = text.replace(" ", "")
        for char in text:
            if char not in string.punctuation:
                count_chars += 1
        return count_chars

    def avg_letters_per_word(text):
        ALPW = float(float(char_count(text)) / token_count)
        return ALPW

    def avg_sentence_per_word(text):
        ASPW = float(len(nltk.sent_tokenize(text)) / float(token_count))
        return ASPW

    if token_count <= 0:
        return 0

    L = avg_letters_per_word(text) * 100  # avg letters per 100 words
    S = avg_sentence_per_word(text) * 100  # avg sentences per 100 words
    CLI = float((0.0588 * L) - (0.296 * S) - 15.8)
    return CLI 
Example #21
Source File: preprocess.py    From atap with Apache License 2.0
def tokenize(self, fileid):
        """
        Segments, tokenizes, and tags a document in the corpus. Returns a
        generator of paragraphs, which are lists of sentences, which in turn
        are lists of part of speech tagged words.
        """
        for paragraph in self.corpus.paras(fileids=fileid):
            yield [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ] 
Example #22
Source File: reader.py    From atap with Apache License 2.0
def sents(self, fileids=None, categories=None):
        """
        Uses the built in sentence tokenizer to extract sentences from the
        paragraphs. Note that this method uses BeautifulSoup to parse HTML.
        """
        for paragraph in self.paras(fileids, categories):
            for sentence in sent_tokenize(paragraph):
                yield sentence 
Example #23
Source File: readability_indices.py    From coling2018_fake-news-challenge with Apache License 2.0
def automated_readability_index(text, token_count):
    """
    Takes a text and returns its Automated Readability Index
    :param text: A string text without punctuation
    :param token_count: the number of tokens in the text
    :return: Automated Readability Index
    """

    # Partly adapted from textstat 0.3.1, which is only available for Python 2 (https://github.com/shivam5992/textstat)
    def char_count(text):
        """
        Function to return total character counts in a text
        """
        count_chars = 0
        text = text.replace(" ", "")
        for char in text:
            if char not in string.punctuation:
                count_chars += 1
        return count_chars

    chrs = char_count(text)
    wrds = token_count
    snts = len(nltk.sent_tokenize(text))

    if wrds == 0 or snts == 0:
        return 0

    a = (float(chrs) / float(wrds))
    b = (float(wrds) / float(snts))
    ARI = (4.71 * a) + (0.5 * b) - 21.43
    return ARI 
Example #24
Source File: oz.py    From atap with Apache License 2.0
def matrix(text, cast):
    mtx = []
    for first in cast:
        row = []
        for second in cast:
            count = 0
            for title, chapter in text['chapters'].items():
                for sent in sent_tokenize(chapter):
                    if first in sent and second in sent:
                        count += 1
            row.append(count)
        mtx.append(row)
    return mtx 
Example #25
Source File: oz.py    From atap with Apache License 2.0
def cooccurrence(text, cast):
    possible_pairs = list(itertools.combinations(cast, 2))
    cooccurring = dict.fromkeys(possible_pairs, 0)
    for title, chapter in text['chapters'].items():
        for sent in sent_tokenize(chapter):
            for pair in possible_pairs:
                if pair[0] in sent and pair[1] in sent:
                    cooccurring[pair] += 1
    return cooccurring 
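
A hypothetical miniature input in the shape the function expects ('chapters' mapping titles to raw text), just to show the returned pair counts:

import itertools
from nltk import sent_tokenize

oz = {'chapters': {'Chapter 1': "Dorothy met the Scarecrow. The Scarecrow and the Lion followed Dorothy."}}
print(cooccurrence(oz, ['Dorothy', 'Scarecrow', 'Lion']))
# {('Dorothy', 'Scarecrow'): 2, ('Dorothy', 'Lion'): 1, ('Scarecrow', 'Lion'): 1}
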
Example #26
Source File: readability_indices.py    From coling2018_fake-news-challenge with Apache License 2.0
def lix_index(text, token_count):
    """
    A readability measure developed by Carl-Hugo Björnsson
    Formula adapted from: https://en.wikipedia.org/wiki/LIX
    :param text: A string text without punctuation
    :return: LIX Index
    """

    def get_long_word_count(text):
        """
        Returns the number of words with more than 6 letters
        """
        long_word_count = 0
        for word in nltk.word_tokenize(text):
            if len(word) > 6:
                long_word_count += 1
        return long_word_count

    A = token_count  # number of words
    B = 0  # number of sentences (also split at ':')
    for sent in nltk.sent_tokenize(text):
        B += len(re.split(':', sent))
    C = get_long_word_count(text)  # number of words with more than 6 letters
    if B > 0 and A > 0:
        LIX = float(A) / B + float(C * 100) / A
        return LIX
    else:
        return 0 
Example #27
Source File: readability_indices.py    From coling2018_fake-news-challenge with Apache License 2.0
def rix_index(text):
    """
    A readability measure developed by Anderson, simplifies LIX index
    Anderson, Jonathan. "Analysing the Readability of English and Non-English
    Texts in the Classroom with Lix"
    source: http://www.jstor.org/stable/40031755?seq=1#page_scan_tab_contents
    :param text: A string text without punctuation
    :return: RIX Index
    """

    def get_long_word_count(text):
        """
        Returns the number of words with more than 6 letters
        """
        long_word_count = 0
        for word in nltk.word_tokenize(text):
            if len(word) > 6:
                long_word_count += 1
        return long_word_count

    sent_count = 0  # number of sentences (also split at ':' and ';')
    for sent in nltk.sent_tokenize(text):
        sent_count += len(re.split('[:;]', sent))

    long_word_count = get_long_word_count(text)  # number of words with more than 6 letters

    if sent_count > 0:
        return float(long_word_count) / sent_count
    else:
        return 0 
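
Since rix_index() only needs the raw text (plus the nltk and re imports at the top of the module), a quick sanity check might look like this:

import re
import nltk

sample = "Readability formulas reward shorter words. Longer, multisyllabic constructions increase difficulty."
print(rix_index(sample))  # 7 long words over 2 sentences -> 3.5
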
Example #28
Source File: readability_indices.py    From coling2018_fake-news-challenge with Apache License 2.0
def mcalpine_eflaw_index(text):
    """
    A readability score defined by Rachel McAlpine
    See https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/

    EFLAW index = (#tokens + #miniwords) / #sentences

    :param text: A string text without punctuation
    :return: McAlpine EFLAW Index
    """

    tokenized_sents = nltk.sent_tokenize(text)
    sentence_count = len(tokenized_sents)
    token_count = 0
    miniword_count = 0  # words with 1,2 or 3 letters
    for sent in tokenized_sents:
        for token in nltk.word_tokenize(sent):
            if token not in string.punctuation:
                token_count += 1
                if len(token) <= 3:
                    miniword_count += 1

    if sentence_count >= 1:
        return float(token_count + miniword_count) / sentence_count
    else:
        return 0 
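
mcalpine_eflaw_index() is self-contained apart from the nltk and string imports at the top of the module, so it can be tried directly:

import string
import nltk

print(mcalpine_eflaw_index("Read this. It is short and clear."))
# 7 tokens + 3 miniwords over 2 sentences -> 5.0
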
Example #29
Source File: ngrams.py    From atap with Apache License 2.0
def ngrams2(text, n=2):
    for sent in sent_tokenize(text):
        sent = word_tokenize(sent)
        for ngram in nltk_ngrams(sent, n):
            yield ngram 
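
The snippet assumes sent_tokenize, word_tokenize and nltk_ngrams are already imported in the module; one possible setup plus a usage example:

from nltk import sent_tokenize, word_tokenize
from nltk import ngrams as nltk_ngrams

print(list(ngrams2("The quick brown fox. It jumped.")))
# [('The', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', '.'), ('It', 'jumped'), ('jumped', '.')]
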
Example #30
Source File: readability.py    From serapis with MIT License
def __init__(self, doc):
        """
        Args:
            doc: str
        """
        self.doc = unidecode(doc)
        self.sentence_count = len(sent_tokenize(doc))
        words = word_tokenize(doc)
        syllables = [self._count_syllables(word) for word in words]
        self.char_count = sum(len(word) for word in words)
        self.syllable_count = sum(syllables)
        self._invalid = not self.sentence_count or not self.char_count
        self.complex_word_count = sum(1 for s in syllables if s >= 4)  # Python 3: filter() returns an iterator, so count directly
        self.word_count = len(words)
        self.words_per_sentence = 1.0 * self.word_count / self.sentence_count if not self._invalid else 0