Python nltk.sent_tokenize() Examples

The following code examples show how to use nltk.sent_tokenize(). They are taken from open source Python projects.
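
Before the project examples, here is a minimal standalone sketch of the function itself. nltk.sent_tokenize() splits a string into a list of sentence strings using NLTK's pre-trained Punkt models, which must be downloaded once; the sample text below is made up purely for illustration.

import nltk

nltk.download('punkt')  # one-time download of the Punkt sentence tokenizer models

text = "NLTK makes sentence splitting easy. It also handles punctuation sensibly."
sentences = nltk.sent_tokenize(text)   # default English model
print(sentences)                       # a list with two sentence strings

# An optional language argument selects another pre-trained model (see Example 20):
# nltk.sent_tokenize(text, language='danish')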

Example 1
Project: gmaneLegacy   Author: ttm   File: textUtils.py    The Unlicense
def medidasSentencas(T):
    TS=k.sent_tokenize(T)
    tokens_sentences=[k.tokenize.wordpunct_tokenize(i) for i in TS] ### Para os POS tags
    knownw_sentences=[[i for i in ts if (i not in stopwords) and (i in WL_)] for ts in tokens_sentences]
    stopw_sentences =[[i for i in ts if i in stopwords] for ts in tokens_sentences]
    puncts_sentences=[[i for i in ts if
         (len(i)==sum([(ii in puncts) for ii in i]))]
         for ts in tokens_sentences] #
    Mchars_sents,  Schars_sents  = mediaDesvio_(TS)
    Mtoks_sents,   Stoks_sents   = mediaDesvio_(tokens_sentences)
    Mknownw_sents, Sknownw_sents = mediaDesvio_(knownw_sentences)
    Mstopw_sents,  Sstopw_sents  = mediaDesvio_(stopw_sentences)
    Mpuncts_sents, Spuncts_sents = mediaDesvio_(puncts_sentences)
    nsents=len(TS)
    mvars=("Mchars_sents","Schars_sents",
            "Mtoks_sents","Stoks_sents",
            "Mknownw_sents","Sknownw_sents",
            "Mstopw_sents","Sstopw_sents",
            "Mpuncts_sents","Spuncts_sents","nsents",
            "tokens_sentences")
    vdict={}
    for mvar in mvars:
        vdict[mvar] = locals()[mvar]
    return vdict 
Example 2
Project: gmaneLegacy   Author: ttm   File: textUtils.py    The Unlicense
def medidasTamanhosSentencas(T,medidas_tokens):
    MT=medidas_tokens
    ############
    # medidas de sentencas
    TS=k.sent_tokenize(T)
    # media e desvio de numero de caracteres por sentenca
    tTS=[len(i) for i in TS]
    mtTS=n.mean(tTS) #
    dtTS=n.std(tTS) #
    
    # media e desvio do tamanho das sentencas em tokens
    sTS=[k.tokenize.wordpunct_tokenize(i) for i in TS] ### Para os POS tags
    tsTS=[len(i) for i in sTS]
    mtsTS=n.mean(tsTS) #
    dtsTS=n.std(tsTS) #

    # media e desvio do tamanho das sentencas em palavras conhecidas
    kw_=MT["kw_"]
    tsTSkw=[len([ii for ii in i if ii in kw_]) for i in sTS]
    mtsTSkw=n.mean(tsTSkw) #
    dtsTSkw=n.std(tsTSkw) #

    # media e desvio do tamanho das sentencas em palavras que retornam synsets e nao sao stopwords
    pv_=MT["kwssnsw_"]
    tsTSpv=[len([ii for ii in i if ii in pv_]) for i in sTS]
    mtsTSpv=n.mean(tsTSpv) #
    dtsTSpv=n.std(tsTSpv) #

    mvars=("mtTS","dtTS","mtsTS","dtsTS","mtsTSkw","dtsTSkw",
           "mtsTSpv","dtsTSpv","sTS")
    vdict={}
    for mvar in mvars:
        vdict[mvar] = locals()[mvar]
    return vdict 
Example 3
Project: gmaneLegacy   Author: ttm   File: textUtils.py    The Unlicense
def medidasTamanhosMensagens(ds, tids=None):
    if not tids:
        mT=[ds.messages[i][3] for i in ds.message_ids]
    else:
        mT=[ds.messages[i][3] for i in tids]

    tmT=[len(t) for t in mT] # chars
    ttmT=[len(k.tokenize.wordpunct_tokenize(t)) for t in mT] # tokens
    tsmT=[len(k.sent_tokenize(t)) for t in mT] # sentences

    mtmT=n.mean(tmT)
    dtmT=n.std(tmT)
    mttmT=n.mean(ttmT)
    dttmT=n.std(ttmT)
    mtsmT=n.mean(tsmT)
    dtsmT=n.std(tsmT)
    mvars=("mtmT","dtmT","mttmT","dttmT","mtsmT","dtsmT")
    vdict={}
    for mvar in mvars:
        vdict[mvar] = locals()[mvar]
    return vdict 
Example 4
Project: formality_emnlp19   Author: jimth001   File: multi_process_tokenizer.py    MIT License
def tokenizer(strings:'list of string',type:str='word',join:bool=True)->'list of results':
    assert type=='word' or type=='sen'
    results=[]
    if type=='word':
        for s in strings:
            if join:
                results.append(' '.join(nltk.word_tokenize(s)))
            else:
                results.append(nltk.word_tokenize(s))
    else:
        for s in strings:
            if join:
                results.append(' '.join(nltk.sent_tokenize(s)))
            else:
                results.append(nltk.sent_tokenize(s))
    return results 
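
For reference, a hypothetical call to the helper above (assuming the Punkt models are installed; the input string is invented for illustration) would behave roughly like this:

tokenizer(["First sentence. Second sentence."], type='sen', join=False)
# -> [['First sentence.', 'Second sentence.']]
tokenizer(["First sentence. Second sentence."], type='word')
# -> ['First sentence . Second sentence .']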
Example 5
Project: dl-models-for-qa   Author: sujitpal   File: kaggle.py    Apache License 2.0
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples 
Example 6
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0
def tokenize(self, document):
        """
        Returns a normalized, lemmatized list of tokens from a document by
        applying segmentation (breaking into sentences), then word/punctuation
        tokenization, and finally part of speech tagging. It uses the part of
        speech tags to look up the lemma in WordNet, and returns the lowercase
        version of all the words, removing stopwords and punctuation.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If punctuation or stopword, ignore token and continue
                if token in self.stopwords or all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma 
Example 7
Project: d-bert   Author: castorini   File: read_vectors.py    MIT License
def main():
    config = read_args(default_config="confs/bert.json")
    with open(config.input_file) as f:
        content = f.read()
    sentences = nltk.sent_tokenize(content)
    random.shuffle(sentences)
    print(f"Read {len(sentences)} sentences.")
    vecs = []
    for sent in tqdm(sentences):
        h = hashlib.md5(sent.encode()).hexdigest()
        if config.lookup_word not in sent.lower():
            continue
        path = os.path.join(config.output_folder, h)
        if not os.path.exists(path):
            continue
        try:
            toks, tok_vecs = torch.load(path)
        except:
            print(path)
            continue
        for w, v in zip(toks, tok_vecs.split(1, 1)):
            if w == config.lookup_word:
                vecs.append(v)
    torch.save(vecs, f"{config.lookup_word}-vecs.pt") 
Example 8
Project: d-bert   Author: castorini   File: extract_bert_vectors.py    MIT License
def main():
    config = read_args(default_config="confs/bert.json")
    bert = mod.BertWrapper.load(config.bert_model)
    with open(config.input_file) as f:
        content = f.read()
    sentences = nltk.sent_tokenize(content)
    random.shuffle(sentences)
    print(f"Read {len(sentences)} sentences.")
    try:
        os.makedirs(config.output_folder)
    except:
        pass
    for sent in tqdm(sentences):
        h = hashlib.md5(sent.encode()).hexdigest()
        path = os.path.join(config.output_folder, h)
        if os.path.exists(path):
            continue
        try:
            out = bert.extract_vectors(sent)
        except:
            continue
        torch.save(out, path) 
Example 9
Project: self-attentive-parser   Author: nikitakit   File: nltk_plugin.py    MIT License
def parse_sents(self, sents):
        """
        Parse multiple sentences

        If "sents" is a string, it will be segmented into sentences using NLTK.
        Otherwise, each element of "sents" will be treated as a sentence.

        sents (str or Iterable[str] or Iterable[List[str]]): sentences to parse

        Returns: Iter[nltk.Tree]
        """
        if isinstance(sents, STRING_TYPES):
            if self._tokenizer_lang is None:
                raise ValueError(
                    "No tokenizer available for this language. "
                    "Please split into individual sentences and tokens "
                    "before calling the parser."
                    )
            sents = nltk.sent_tokenize(sents, self._tokenizer_lang)

        for parse_raw, tags_raw, sentence in self._batched_parsed_raw(self._nltk_process_sents(sents)):
            yield self._make_nltk_tree(sentence, tags_raw, *parse_raw) 
Example 10
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: utils.py    BSD 2-Clause "Simplified" License
def get_labels(content,inits):
    rngs = []
    
    rngs.sort() # just in case, range lists look sorted
    acum_sent_len = [-1]
    sents = nltk.sent_tokenize(content)
    for sent in sents:
        acum_sent_len.append( len(sent) + acum_sent_len[-1] + 1) # adds blank space lost in sent_tokenization
    nsents = len(acum_sent_len)-1
    labels  = np.zeros(nsents,dtype=int)
    for lb in inits:
        sid = -1
        for i in xrange(1,nsents+1):
            if lb < acum_sent_len[i]:
                sid = i-1
                break
        labels[sid] = 1
        if sid==-1:
            pdb.set_trace()

    return labels 
Example 11
Project: BuildingMachineLearning   Author: ademyanchuk   File: classify.py    MIT License
def prepare_sent_features():
    for pid, text in fetch_posts(chosen, with_index=True):
        if not text:
            meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0
        else:
            from platform import python_version
            if python_version().startswith('2'):
                text = text.decode('utf-8')
            sent_lens = [len(nltk.word_tokenize(
                sent)) for sent in nltk.sent_tokenize(text)]
            meta[pid]['AvgSentLen'] = np.mean(sent_lens)
            meta[pid]['AvgWordLen'] = np.mean(
                [len(w) for w in nltk.word_tokenize(text)])

        meta[pid]['NumAllCaps'] = np.sum(
            [word.isupper() for word in nltk.word_tokenize(text)])

        meta[pid]['NumExclams'] = text.count('!') 
Example 12
Project: YelpDataChallenge   Author: fujunswufe   File: representation.py    Apache License 2.0
def get_sentence_tokens(text):
    '''
    Given a text(review), return the token list of each sentence
    :param text:
    :return:
    '''
    sentences = sent_tokenize(text)

    sent_tokens = []
    for sentence in sentences:
        sent_token = word_tokenize(sentence)
        sent_token = [token for token in sent_token if ((not token.strip()=='') and (not token in stopwords))]
        sent_tokens.append(sent_token)
    # remove stop words and short tokens

    # stemming: experiments showed that stemming did not help
    # if (stemming):
    #     stemmer = PorterStemmer()
    #     texts = [[ stemmer.stem(token) for token in text] for text in texts]
    return sent_tokens 
Example 13
Project: YelpDataChallenge   Author: fujunswufe   File: reader.py    Apache License 2.0
def get_review_sentences():
    '''
    Read the yelp reviews and return the sentences after sentence segmentation
    :return:
    '''
    review_file = io.open(FULL_YELP_REVIEW_PATH, 'r', encoding='utf-8')
    count_sentence = 0
    sentences = []

    for line in review_file:
        json_review = json.loads(line.strip())
        text = json_review.get("text").replace('\n','').lower()

        raw_sentences = sent_tokenize(text)
        for raw_sentence in raw_sentences:
            if len(raw_sentence.strip()) > 0:
                sent_tokens = word_tokenize(raw_sentence)
                sentences.append(sent_tokens)
    return sentences 
Example 14
Project: controcurator   Author: ControCurator   File: summarization.py    MIT License
def extract_nltk(comment):
    body = comment['text']
    entities = {}
    sentences = nltk.sent_tokenize(body)
    print(sentences)
    for sentence in sentences:

        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        chunks = nltk.ne_chunk(tagged)
        for chunk in chunks:
            if type(chunk) is nltk.Tree:
              t = ''.join(c[0] for c in chunk.leaves())
              entities[t] = chunk.label()
    #print entities
    return entities 
Example 15
Project: controcurator   Author: ControCurator   File: sentimentExtractor.py    MIT License
def getSentiment(text):
		sentences = nltk.sent_tokenize(text)
		s = {}
		s['sentences'] = len(sentences)
		s['words'] = 0
		s['sentiment'] = 0
		s['positivity'] = 0
		s['negativity'] = 0
		s['intensity'] = 0
		
		for sentence in sentences:
			if sentence <> '.':
				sentence = sentence.replace(' .', '.')
		
				tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
				for word, treebank in tagged:
					sentiment = SentimentExtractor.get_sentiment_score_from_tagged(word, treebank, skipWordNetPos=[])
					if sentiment <> None and sentiment[0] != 0 and sentiment[1] != 0:
						s['words'] += 1
						s['positivity'] += sentiment[0]
						s['negativity'] += sentiment[1]
						s['sentiment'] = (s['positivity'] - s['negativity']) / s['words']
						s['intensity'] = (s['positivity'] + s['negativity']) / s['words']
		return s 
Example 16
Project: partisan-discourse   Author: DistrictDataLabs   File: nlp.py    Apache License 2.0
def preprocess(html):
    """
    Returns a preprocessed document consisting of a list of paragraphs, which
    is a list of sentences, which is a list of tuples, where each tuple is a
    (token, part of speech) pair.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e))) 
Example 17
Project: easy_learner   Author: lavizhao   File: rake.py    Apache License 2.0
def split_sentences(text):
    """
    Utility function to return a list of sentences.
    @param text The text that must be split in to sentences.
    """
    #sentence_delimiters = re.compile(u'[.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
    #sentences = sentence_delimiters.split(text)
    sentences = nltk.sent_tokenize(text)
    result = []
    for s in sentences:
        if s[-1] == '.' or s[-1] == "?" or s[-1] == "," or s[-1] == ';':
            result.append(s[:-1])
        else:
            result.append(s)
    
    return result#sentences 
Example 18
Project: NBA-Basketball-Web-Crawler   Author: ThomasDang93   File: webCrawler.py    Apache License 2.0
def cleanup(rawfile):
    """Write a function to clean up the text. You might need to delete newlines and tabs.
    Extract sentences with NLTK’s sentence tokenizer. Write the sentences for each file to a new file.
    That is, if you have 15 files in, you have 15 files out.
    You might need to clean up the cleaned up files manually to delete irrelevant material."""
    cleanfile = 'clean_{}'.format(rawfile[4:38])
    script_dir = os.path.dirname(os.path.abspath(__file__))
    dest_dir = os.path.join(script_dir, 'clean')
    try:
        os.makedirs(dest_dir)
    except OSError:
        pass
    path = os.path.join(dest_dir, cleanfile)
    with open(rawfile) as f:
        with open(path, 'w') as output:
            text = f.read()

            mapping = [('\\n', ''), ('\\t', ''), ('\\s\\s+', ' '), ("\\'","'")]
            for k, v in mapping:
                text = text.replace(k, v)
            sentences = nltk.sent_tokenize(text)
            for s in sentences:
                output.write(s) 
Example 19
Project: golden   Author: terrosdesigns   File: golden.py    MIT License
def content(soup, sentences=1):
    content_abstract = soup.find("div", {"class": "TopicDetail__body"})
    first_section = content_abstract.find("div", {"class": "TopicDetail__overview__block"})
    text_content = first_section.find("div", {"class": "Editor--article"})
    if not text_content:
        print("No content to display.")
        return
    if len(text_content.findAll("p", {"class": "Editor__text"})) == 0:
        return "No content to display"
    elif len(text_content.findAll("p", {"class": "Editor__text"})) == 1:
        try:
            content = text_content.findAll("p", {"class": "Editor__text"}).get_text()
        except:
            return "No content to display"
        content_sent = sent_tokenize(content)
        description = " ".join(content_sent[0:sentences])
    else:
        i=0
        content = []
        while i < len(text_content.findAll("p", {"class": "Editor__text"})):
            content.append(text_content.findAll("p", {"class": "Editor__text"})[i].get_text())
            i+=1
        description = " ".join(content[0:sentences])
    if description:
        return description 
Example 20
Project: dasem   Author: fnielsen   File: text.py    Apache License 2.0
def sentence_tokenize(text):
    """Tokenize a Danish text into sentence.

    The model from NTLK trained on Danish is used.

    Parameters
    ----------
    text : str
        The text to be tokenized.

    Returns
    -------
    sentences : list of str
        Sentences as list of strings.

    Examples
    --------
    >>> text = 'Hvad!? Hvor har du f.eks. siddet?'
    >>> sentences = sentence_tokenize(text)
    >>> sentences
    ['Hvad!?', 'Hvor har du f.eks. siddet?']

    """
    return sent_tokenize(text, language='danish') 
Example 21
Project: transfer-learning-ner   Author: ciads-ut   File: makeconll.py    MIT License
def parse_doc(filename, index):

    docs = ieer.parsed_docs(filename)
    dt = docs[index].text
    words = dt.leaves()
    tags = tree2conll_without_postags(dt)

    rr = nltk.sent_tokenize(' '.join(words))
    # small fixes:
    if filename =='NYT_19980315' and index==11:
        rr[8]=rr[8]+rr[9]
        rr.remove(rr[9])
    if filename == 'NYT_19980407':
        if index == 4:
            rr[19]=rr[19]+rr[20]
            rr.remove(rr[20])
        if index == 13:
            rr[9] = rr[9]+rr[10]
            rr.remove(rr[10])

    L =get_breaks(words, rr)
    L.append(len(tags)) # otherwise you miss the last sentence of the document.
    tags = [tags[L[i]:L[i+1]] for i in range(len(L)-1)]
    return tags 
Example 22
Project: ace2005-preprocessing   Author: nlpcl-lab   File: parser.py    MIT License
def parse_sgm(self, sgm_path):
        with open(sgm_path, 'r') as f:
            soup = BeautifulSoup(f.read(), features='html.parser')
            self.sgm_text = soup.text

            doc_type = soup.doc.doctype.text.strip()

            def remove_tags(selector):
                tags = soup.findAll(selector)
                for tag in tags:
                    tag.extract()

            if doc_type == 'WEB TEXT':
                remove_tags('poster')
                remove_tags('postdate')
                remove_tags('subject')
            elif doc_type in ['CONVERSATION', 'STORY']:
                remove_tags('speaker')

            sents = []
            converted_text = soup.text

            for sent in nltk.sent_tokenize(converted_text):
                sents.extend(sent.split('\n\n'))
            sents = list(filter(lambda x: len(x) > 5, sents))
            sents = sents[1:]
            sents_with_pos = []
            last_pos = 0
            for sent in sents:
                pos = self.sgm_text.find(sent, last_pos)
                last_pos = pos
                sents_with_pos.append({
                    'text': sent,
                    'position': [pos, pos + len(sent)]
                })

            return sents_with_pos 
Example 23
Project: textsplitter   Author: timokoola   File: splitter.py    Apache License 2.0
def tokenized_file(fn, encoding):
    f = io.open(fn, encoding=encoding)
    text = f.read()
    f.close()
    return sent_tokenize(text) 
Example 24
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def read_corpus():
    """open corpus file and create word and sentence tokens
    corpus file is the base brain for Tobot which contains words/sentences
    used by nltk and sklearn to help Tobot respond to questions"""
    f = open('tobot_corpus.txt', 'r', errors='ignore')
    raw = f.read()
    f.close()
    raw = raw.lower()
    #nltk.download('punkt')
    #nltk.download('wordnet')
    #nltk.download('stopwords')
    sent_tokens = nltk.sent_tokenize(raw)
    word_tokens = nltk.word_tokenize(raw)

    return sent_tokens, word_tokens 
Example 25
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def get_sentences(text):
    """Retrieve the sentences present in a given string of text.
    The return value is a list of sentences."""
    text = text.split('.')
    text = '. '.join(text).strip()
    sentList = nltk.sent_tokenize(text)
    return sentList 
Example 26
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-6.1-nlp-pipeline.py    MIT License
def tokenize_sentences(targets):
    while True:
        text = (yield)  # (yield) gets an item from an upstream step
        sentences = nltk.sent_tokenize(text)
        for sentence in sentences:
            for target in targets:
                target.send(sentence)  # send() sends data downstream 
Example 27
Project: gmaneLegacy   Author: ttm   File: textUtils.py    The Unlicense
def medidasSinais(T):
    wtok=k.tokenize.wordpunct_tokenize(T)
    lens_tok=[len(i) for i in wtok]
    lens_word=[len(i) for i in wtok if (i not in stopwords) and (i in WL_)]
    lens_sent=[len(i) for i in k.sent_tokenize(T)]
    mvars=("lens_tok","lens_word","lens_sent")
    vdict={}
    for mvar in mvars:
        vdict[mvar] = locals()[mvar]
    return vdict 
Example 28
Project: gmaneLegacy   Author: ttm   File: textUtils.py    The Unlicense
def medidasMensagens(ds,tids=None):
    # TTM
    if not tids:
        mT=[ds.messages[i][3] for i in ds.message_ids]
    else:
        mT=[ds.messages[i][3] for i in tids]
    tokens_msgs=[k.tokenize.wordpunct_tokenize(t) for t in mT] # tokens
    knownw_msgs=[[i for i in toks if (i not in stopwords) and (i in WL_)] for toks in tokens_msgs]
    stopw_msgs=[[i for i in toks if i in stopwords] for toks in tokens_msgs]
    puncts_msgs=[[i for i in toks if
         (len(i)==sum([(ii in puncts) for ii in i]))]
         for toks in tokens_msgs] #
    sents_msgs=[k.sent_tokenize(t) for t in mT] # tokens
    nmsgs=len(mT)
    toks_msgs=[len(i) for i in mT]
    Mchars_msgs,   Schars_msgs  = mediaDesvio_(mT)
    Mtokens_msgs,  Stokens_msgs = mediaDesvio_(tokens_msgs)
    Mknownw_msgs,  Sknownw_msgs = mediaDesvio_(knownw_msgs)
    Mstopw_msgs,   Sstopw_msgs  = mediaDesvio_(stopw_msgs)
    Mpuncts_msgs,  Spuncts_msgs = mediaDesvio_(puncts_msgs)
    Msents_msgs,Ssents_msgs     = mediaDesvio_(sents_msgs)
    mvars=("nmsgs","toks_msgs",
            "Msents_msgs","Ssents_msgs",
            "Mtokens_msgs","Stokens_msgs",
            "Mknownw_msgs","Sknownw_msgs",
            "Mstopw_msgs","Sstopw_msgs",
            "Mpuncts_msgs","Spuncts_msgs",
            "Mchars_msgs","Schars_msgs",
            )
    vdict={}
    for mvar in mvars:
        vdict[mvar] = locals()[mvar]
    return vdict 
Example 29
Project: qb   Author: Pinafore   File: cached_wikipedia.py    MIT License
def extract_wiki_sentences(title, text, n_sentences, replace_title_mentions=''):
    """
    Extracts the first n_sentences from the text of a wikipedia page corresponding to the title.
    strip_title_mentions and replace_title_mentions control handling of references to the title in text.
    Oftentimes QA models learn *not* to answer entities mentioned in the question so this helps deal with this
    in the domain adaptation case.

    :param title: title of page
    :param text: text of page
    :param n_sentences: number of sentences to use
    :param replace_title_mentions: Replace mentions with the provided string token, by default removing them
    :return:
    """
    # Get simplest representation of title and text
    title = unidecode(title).replace('_', ' ')
    text = unidecode(text)

    # Split on non-alphanumeric
    title_words = re.split('[^a-zA-Z0-9]', title)
    title_word_pattern = '|'.join(re.escape(w.lower()) for w in title_words)

    # Breaking by newline yields paragraphs. Ignore the first since its always just the title
    paragraphs = [p for p in text.split('\n') if len(p) != 0][1:]
    sentences = []
    for p in paragraphs:
        formatted_text = re.sub(title_word_pattern, replace_title_mentions, p, flags=re.IGNORECASE)
        # Cleanup whitespace
        formatted_text = re.sub('\s+', ' ', formatted_text).strip()

        sentences.extend(nltk.sent_tokenize(formatted_text))

    return sentences[:n_sentences] 
Example 30
Project: EliIE   Author: Tian312   File: word2vec.py    MIT License
def tokenize_train(train_directory,tokenized_directory):
    with codecs.open(train_directory, "r", "utf-8") as file:
	    with codecs.open(tokenized_directory, "w", "utf-8") as writer:
		    new_sens = []
		    for line in file:
			    sentences = sent_tokenize(line.strip())
			    for sen in sentences:

				    sen = word_tokenize(sen.lower())
				    new_sen = ' '.join(sen)
				    new_sens.append(new_sen)
				    writer.write(new_sen)
				    writer.write("\n")
    sentences = gensim.models.word2vec.LineSentence(tokenized_directory)
    return sentences 
Example 31
Project: EliIE   Author: Tian312   File: retrieve_texts.py    MIT License
def sentence_splitting (texts, slen = 1):           # Split ec into separate sentences.
    if len(texts) <= 0:
        return []

    # splitting
    sentences = []
    text_sents = nltk.sent_tokenize(texts)
    if (text_sents != [''] and len(text_sents) >  0):
        for sent in text_sents:
            sent=re.sub('e.g.','eg',sent)
            sent = sent.strip().split('\r') # split strings that contains "\r"
            for sen in sent:
                se = re.split('[.;]',sen)

                for s in se:
                    ss=s.split('-  ')
                    for final in ss:
                        #print final

                        match=re.match('^\d+\.\s*$',final)
                        if match:
                            continue
                        final=re.sub('\s+$','',final)
                        final=re.sub('\d+\.','',final)
                        final=final.encode('utf-8').decode('utf-8','ignore').encode("utf-8")
                        words=final.decode('ascii', 'ignore').split(' ')
                        new_words=[]
                        for w in words:
                            if w:
                                #print "=="+w+"=="
                                match=re.search('(\(*\w+\)*,*.*)',w)
                                if match:
                                    #print match.group(1)
                                    new_words.append(match.group(1))
                        new_sent=' '.join(new_words)
                        if new_sent:
                            sentences.append(new_sent)
                            #print new_sent


    return sentences 
Example 32
Project: luna   Author: blacchat   File: test.py    Apache License 2.0
def analyze(text): 
    tokenized = nltk.sent_tokenize(text)
    sid = SentimentIntensityAnalyzer()
    sid.lexicon.update(words)
    scores = []
    compound = []
    for token in tokenized: 
        scores.append(sid.polarity_scores(token))
        print(token)
        print(sid.polarity_scores(token))
    for score in scores: 
        compound.append(score["compound"])
    return mean(compound) 
Example 33
Project: luna   Author: blacchat   File: luna.py    Apache License 2.0
def analyze(text): 
    global sid 
    print("Analyzing text..")
    tokenized = nltk.sent_tokenize(text)
    compound = [] 
    scores = [sid.polarity_scores(token) for token in tokenized]
    compound = [score["compound"] for score in scores]
    return (mean(compound)-0.1)
    # adjusting downwards 
Example 34
Project: chowmein   Author: xiaohan2012   File: data.py    MIT License
def load_line_corpus(path, tokenize=True):
    docs = []
    with codecs.open(path, "r", "utf8") as f:
        for l in f:
            if tokenize:
                sents = nltk.sent_tokenize(l.strip().lower())
                docs.append(list(itertools.chain(*map(
                    nltk.word_tokenize, sents))))
            else:
                docs.append(l.strip())
    return docs 
Example 35
Project: sentiment-analysis-api   Author: nileshsah   File: Cluster.py    MIT License
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems 
Example 36
Project: sentiment-analysis-api   Author: nileshsah   File: Cluster.py    MIT License
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens 
Example 37
Project: sentiment-analysis-api   Author: nileshsah   File: kMeans.py    MIT License
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems 
Example 38
Project: sentiment-analysis-api   Author: nileshsah   File: kMeans.py    MIT License
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens 
Example 39
Project: chatbot-data   Author: fangyihao   File: tokenizer.py    GNU General Public License v3.0
def utter_tokenize(self, utterance):
        words = []
    
        # Extract sentences
        sentences = self.sent_tokenize(utterance)
    
        # We add sentence by sentence until we reach the maximum length
        for i in range(len(sentences)):
            tokens = self.word_tokenize(sentences[i])
            for token in tokens:
                    words.append(token)
        
        return words 
Example 40
Project: chatbot-data   Author: fangyihao   File: tokenizer.py    GNU General Public License v3.0
def sent_tokenize(self, utterance):
        sentences = nltk.sent_tokenize(utterance)
        # correct the last sentence's segmentation
        fragment = ""
        last = len(sentences)
        for i in range(len(sentences)):
            m = re.match(r'^[ ]*["\?!\.\-*0-9]+[ ]*$', sentences[len(sentences) - 1 - i])
            if m:
                fragment = m.group(0) + fragment
                last = len(sentences) - 1 - i
            else:
                break
        if last > 0:
            sentences = sentences[:last]
            if fragment != "":
                sentences[last - 1] += fragment
        else:
            sentences = [fragment]
        
        fragment = ""    
        first = 0
        # correct the first sentence's segmentation
        for j in range(len(sentences)):
            m1 = re.search(r"[ \.]([A-Z]|No|Op|a\.k\.a|Sr)[\.]\s*$", sentences[j])
            if m1:
                fragment += sentences[j]
                first = j + 1
            else:
                break
        if first < len(sentences):
            sentences = sentences[first:]
            if fragment != "":
                sentences[0] = fragment + sentences[0]
        else:
            sentences = [fragment]
        return sentences 
Example 41
Project: haiku-generator   Author: nlsandler   File: markov.py    MIT License
def _add_sentences(self, text):
        # First read in everything as one string,
        # So we can generate more than one sentence at a time
        self._update(text)
        # now make each sentence a starting point
        # first sentence already is, so ignore it
        sentences = nltk.sent_tokenize(text)[1:]
        for sentence in sentences:
            # make each sentence a starting point!
            self._add_chain_start(sentence) 
Example 42
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0
def get_sentences(document):
    sentences = nltk.sent_tokenize(document['fields']['content'][0])
    sentences = [tokenize(sent) for sent in sentences]
    final = []

    for sentence_num, sentence in enumerate(sentences):
        if len(sentence) == 0:
            continue

        final.append(gensim.models.doc2vec.TaggedDocument(
            words=sentence,
            tags=['{}_{}'.format(document['_id'], sentence_num)]
        ))

    return final 
Example 43
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0
def score_sentences(self, fileid):
        fileidx = self.corpus.fileids().index(fileid)
        for sent in nltk.sent_tokenize(self.corpus.raw(fileid)):
            score  = 0
            for token in self.tokenizer(sent):
                if token not in self.features: continue
                score += self.tdm[fileidx, self.features.index(token)]
            yield score, sent 
Example 44
Project: QAUniBonn   Author: jtrillos   File: nerQuestion.py    Apache License 2.0
def extract_entity_question (question):

    sample = question
    sentences = nltk.sent_tokenize(sample) #split in to sentences
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences] #split in to words
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences] #tag sentences with NN, NNP, etc
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    entity_names = []
    for tree in chunked_sentences:
        # Print result tree
        # print tree
        # Print results per sentence
        # print extract_entity_names(tree)

        entity_names.extend(extract_entity_names(tree))

    # Print all entity names
    # print entity_names

    # Remove incorrect entity "which"
    if 'Which' in entity_names:
        entity_names.remove('Which')
    if 'which' in entity_names:
        entity_names.remove('which')

    # Print unique entity names
    # print set(entity_names)
    return entity_names 
Example 45
Project: neural-complexity   Author: vansky   File: data.py    Apache License 2.0
def sent_tokenize_with_unks(self, path):
        """ Tokenizes a text file into sentences, adding unks if needed. """
        assert os.path.exists(path), "Bad path: %s" % path
        all_ids = []
        sents = []
        if path[-2:] == 'gz':
            with gzip.open(path, 'rb') as file_handle:
                for fchunk in file_handle.readlines():
                    for line in sent_tokenize(fchunk.decode("utf-8")):
                        if line.strip() == '':
                            # Ignore blank lines
                            continue
                        sents.append(line.strip())
                        words = ['<eos>'] + line.split() + ['<eos>']
                        ids = self.convert_to_ids(words)
                        all_ids.append(ids)
        else:
            with open(path, 'r') as file_handle:
                for fchunk in file_handle:
                    for line in sent_tokenize(fchunk):
                        if line.strip() == '':
                            # Ignore blank lines
                            continue
                        sents.append(line.strip())
                        words = ['<eos>'] + line.split() + ['<eos>']
                        ids = self.convert_to_ids(words)
                        all_ids.append(ids)
        return (sents, all_ids) 
Example 46
Project: d-bert   Author: castorini   File: tokenize_imdb.py    MIT License
def main():
    punc_error_patt = re.compile(r"([A-z])(\.|\?|\!)([A-z])")
    url_patt = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)")
    contiguous_space_patt = re.compile(r"\s+")

    for line in sys.stdin:
        rating, document = line.split("\t")
        rating = rating.index("1")
        if rating not in (3, 4, 5):
            continue
        document = re.sub(url_patt, "", document)
        document = re.sub(punc_error_patt, r"\1\2 \3", document)
        document = re.sub(contiguous_space_patt, " ", document)
        print("\n".join(nltk.sent_tokenize(document)).lower()) 
Example 47
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: utils.py    BSD 2-Clause "Simplified" License
def get_labels(content,tok_rngs):
    rngs = []
    for str_rng in tok_rngs.split(','):
        temp = str_rng.split(':')
        l,r = int(temp[0]),int(temp[1])
        rngs.append( (l,r) )
    rngs.sort() # just in case, range lists look sorted
    acum_sent_len = [0]
    sents = nltk.sent_tokenize(content)
    for sent in sents:
        sent_len = len(sent.split())
        acum_sent_len.append( sent_len + acum_sent_len[-1] )
    nsents = len(acum_sent_len)-1
    labels  = np.zeros(nsents,dtype=int)
    for lb,rb in rngs:
        fst_sent = -1
        lst_sent = -1
        for i in xrange(1,nsents+1):
            if lb < acum_sent_len[i]:
                fst_sent = i-1
                break
        for i in xrange(fst_sent,nsents+1):
            if rb-1 < acum_sent_len[i]:
                lst_sent = i-1
                break
        labels[fst_sent:lst_sent+1] = 1
        if fst_sent>lst_sent:
            pdb.set_trace()

    return labels 
Example 48
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: sample.py    BSD 2-Clause "Simplified" License
def __init__(self,sid,content,question,labels):
    self.sid = sid
    sents = nltk.sent_tokenize(content)
    self.content = []
    for sent in sents:
      self.content.append( nltk.word_tokenize(sent) )
    self.question = nltk.word_tokenize(question)
    self.labels = labels 
Example 49
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: sample.py    BSD 2-Clause "Simplified" License
def __init__(self,sid,content,question,labels):
    self.sid = sid
    sents = nltk.sent_tokenize(content)
    self.content = []
    for sent in sents:
      self.content.append( nltk.word_tokenize(sent) )
    self.question = nltk.word_tokenize(question)
    self.labels = labels 
Example 50
Project: alta2017-rl   Author: dmollaaliod   File: __init__.py    GNU General Public License v3.0
def yield_candidate_text(questiondata):
    """Yield all candidate text for a question
    >>> data = json.load(open("BioASQ-trainingDataset5b.json", encoding='utf-8'))['questions']
    >>> y = yield_candidate_text(data[1], snippets_only=True)
    >>> next(y)
    ('55046d5ff8aee20f27000007', 0, 'the epidermal growth factor receptor (EGFR) ligands, such as epidermal growth factor (EGF) and amphiregulin (AREG)')
    >>> next(y)
    ('55046d5ff8aee20f27000007', 1, ' EGFR ligands epidermal growth factor (EGF), amphiregulin (AREG) and transforming growth factor alpha (TGFα)')
"""
    past_pubmed = set()
    sn_i = 0
    for sn in questiondata['snippets']:
        for s in sent_tokenize(sn['text']):
            yield (questiondata['id'], sn_i, s)
            sn_i += 1 
Example 51
Project: alta2017-rl   Author: dmollaaliod   File: my_tokenizer.py    GNU General Public License v3.0
def my_tokenize(string):
    """Return the list of tokens.
    >>> my_tokenize("This is a sentence. This is another sentence.")
    ['sentence', 'another', 'sentence']
    """
    return [w.lower() 
            for s in nltk.sent_tokenize(string) 
            for w in nltk.word_tokenize(s)
            if w.lower() not in stopwords.words('english') and
               w not in [',','.',';','(',')','"',"'",'=',':','%','[',']']] 
Example 52
Project: summarize   Author: danieldeutsch   File: setup.py    Apache License 2.0
def load_dpp(root_path: str) -> List[List[str]]:
    glob_string = os.path.join(root_path, 'SOAsums/DPP/*.DPP')
    summaries = []
    for file_path in sorted(glob(glob_string)):
        summary_text = open(file_path, 'r').read()
        summary = nltk.sent_tokenize(summary_text)
        summaries.append(summary)
    return summaries 
Example 53
Project: gender-bias   Author: gender-bias   File: document.py    MIT License
def sentences(self) -> List[str]:
        """
        Compute a list of sentences.

        Uses nltk.sent_tokenize.

        Returns:
            List[str]

        """
        return [s.replace('\n', ' ') for s in nltk.sent_tokenize(self._text)] 
Example 54
Project: practicalDataAnalysisCookbook   Author: drabastomek   File: nlp_pos_alternative.py    GNU General Public License v2.0
def preprocess_data(text):
    global sentences, tokenized
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    sentences =  nltk.sent_tokenize(text)
    tokenized = [tokenizer.tokenize(s) for s in sentences]

# import the data 
Example 55
Project: practicalDataAnalysisCookbook   Author: drabastomek   File: nlp_countWords.py    GNU General Public License v2.0
def preprocess_data(text):
    global sentences, tokenized
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    sentences =  nltk.sent_tokenize(text)
    tokenized = [tokenizer.tokenize(s) for s in sentences]

# import the data 
Example 56
Project: practicalDataAnalysisCookbook   Author: drabastomek   File: nlp_pos.py    GNU General Public License v2.0
def preprocess_data(text):
    global sentences, tokenized
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    sentences =  nltk.sent_tokenize(text)
    tokenized = [tokenizer.tokenize(s) for s in sentences]

# import the data 
Example 57
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition   Author: PacktPublishing   File: classify.py    MIT License
def prepare_sent_features():
    for pid, text in fetch_posts(chosen, with_index=True):
        if not text:
            meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0
        else:
            sent_lens = [len(nltk.word_tokenize(
                sent)) for sent in nltk.sent_tokenize(text)]
            meta[pid]['AvgSentLen'] = np.mean(sent_lens)
            meta[pid]['AvgWordLen'] = np.mean(
                [len(w) for w in nltk.word_tokenize(text)])

        meta[pid]['NumAllCaps'] = np.sum(
            [word.isupper() for word in nltk.word_tokenize(text)])

        meta[pid]['NumExclams'] = text.count('!') 
Example 58
Project: YelpDataChallenge   Author: fujunswufe   File: textrank_fujun.py    Apache License 2.0
def print_summary(indexes, doc, extract_n, doc_index):

    if len(indexes) < extract_n:
        extract_n = len(indexes)

    reference = "reference/task" + str(doc_index) + "_englishReference" + str(doc_index) + ".txt"
    reference_output = io.open(reference, "w", encoding='utf8')
    tips = sent_tokenize(doc.tip)

    for tip in tips:
        reference_output.write(tip + "\n")
    reference_output.close()

    sentences = sent_tokenize(doc.review)
    
    #print ""
    ## print "sentences length: " + str(len(sentences))
    #print ""
    #print "indexes: " + str(indexes)
    #print ""
    
    system = "system/task" + str(doc_index) + "_englishSyssum" + str(doc_index) + ".txt"
    system_output = io.open(system, "w", encoding='utf8')    
    for i in range(0, extract_n):
        #print "index: " + str(indexes[i])
        system_output.write(sentences[indexes[i]] + "\n")

    system_output.close() 
Example 59
Project: ewe_ebooks   Author: jaymcgrath   File: bookstopher.py    MIT License
def __init__(self, body, author='Anonymous'):

        # accumulators
        hashtags = []

        # Now process cleaned up text with NLTK
        words = []
        bigrams = []
        trigrams = []
        quadgrams = []
        sentences = []


        words = word_tokenize(body)

        sentences.extend(sent_tokenize(body))

        # Strip whitespace from each sentence
        sentences = [sentence.strip() for sentence in sentences]

        bigrams = ngrams(body, 2)
        trigrams = ngrams(body, 3)
        quadgrams = ngrams(body, 4)

        self.body = body
        self.words = words
        self.bigrams = bigrams
        self.trigrams = trigrams
        self.quadgrams = quadgrams
        self.sentences = sentences
        self.hashtags = hashtags
        self.author = author

        #TODO: Create "hashtags" from arbitrary number of rarest words 
Example 60
Project: controcurator   Author: ControCurator   File: clusterComments.py    MIT License
def tokenize_and_stem(text):
	# first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
	tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
	filtered_tokens = []
	# filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
	for token in tokens:
		if re.search('[a-zA-Z]', token):
			filtered_tokens.append(token)
	stems = [stemmer.stem(t) for t in filtered_tokens]
	return stems 
Example 61
Project: controcurator   Author: ControCurator   File: article.py    MIT License
def getSentences(self, texts):
		sentences = []
		[sentences.extend(nltk.sent_tokenize(text)) for text in texts]
		return [sentence.replace(' .', '.') for sentence in sentences if sentence <> '.'] 
Example 62
Project: deep-summarization-toolkit   Author: gooppe   File: utils.py    MIT License
def split_into_sentences(text: str) -> typing.List[str]:
    """Split text into sentences.

    Args:
        text (str): Text.

    Returns:
        typing.List[str]: Sentences.
    """

    return nltk.sent_tokenize(text) 
Example 63
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: 9.5 Skipgram_Keras.py    MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example 64
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: 9.2 Email_Classification.py    MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]

    except:
        tokens = tokens
        
    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example 65
Project: Resume-Mining-Machine-Learning-Neo4j   Author: erayon   File: CleanResume.py    GNU General Public License v3.0
def plan_text(path):
	'''
	Return clean plain text with stop words removed.
	'''
	proc = subprocess.Popen(['pdf2txt.py',path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
	temp=proc.communicate()[0]
	temp = temp.decode('ascii', errors='ignore') 
	cleanText = re.sub("\n", "", temp)
	document = " ".join([i for i in cleanText.split() if i not in stop])
	sentences = nltk.sent_tokenize(document)
	cleanText=" ".join(sentences)
	return cleanText 
Example 66
Project: DeepTriage   Author: huazhisong   File: prepocessing_bugs.py    GNU Affero General Public License v3.0
def read_lines(file_path):
    # open description file
    with open(file_path, encoding='latin2') as f:
        # remove last 5 lines
        lines_raw = f.readlines()
        # read lines specially
        selected_lines = clean_raw(lines_raw)
        # raw text
        raw_text = ' '.join(selected_lines)
        # decode utf8 coding
        raw_text = raw_text.encode('utf8').decode('utf8')
        # sentence tokenizer
        sentences = nltk.sent_tokenize(raw_text)
        tokens = []
        # dealing words
        wnl = nltk.WordNetLemmatizer()
        english_stopwords = stopwords.words('english')
        for sentence in sentences:
            # clean raw sentence
            sentence = clean_raw_cnn(sentence)
            # words tokenizer
            raw_words = nltk.word_tokenize(sentence)
            # clean words
            tmp = clean_words(raw_words, wnl, english_stopwords)
            tokens.extend(tmp)

        assert len(tokens) > 0
        line = ' '.join(tokens)

    return line 
Example 67
Project: PRSummarizer   Author: Tbabm   File: utils.py    MIT License
def write_for_rouge(reference, decoded_words, ex_index, _rouge_ref_dir, _rouge_dec_dir):
    """
    requires text that has not been sentence-tokenized, and will use nltk to conduct sent_tokenize
    """
    decoded_abstract = " ".join(decoded_words)
    write_for_rouge_raw(reference, decoded_abstract, ex_index, _rouge_ref_dir, _rouge_dec_dir) 
Example 68
Project: PRSummarizer   Author: Tbabm   File: utils.py    MIT License
def prepare_rouge_text(text):
    # replace <nl> to \n
    text = replace_nl(text)
    # pyrouge calls a perl script that puts the data into HTML files.
    # Therefore we need to make our output HTML safe.
    text = make_html_safe(text)
    sents = sent_tokenize(text)
    text = "\n".join(sents)
    return text 
Example 69
Project: PRSummarizer   Author: Tbabm   File: utils.py    MIT License
def write_for_rouge_raw(reference, decoded_abstract, ex_index, _rouge_ref_dir, _rouge_dec_dir):
    """
    requires text that has not been sentence-tokenized, and will use nltk to conduct sent_tokenize
    """
    decoded_abstract = prepare_rouge_text(decoded_abstract)
    reference = prepare_rouge_text(reference)

    ref_file = get_ref_file(_rouge_ref_dir, ex_index)
    decoded_file = get_dec_file(_rouge_dec_dir, ex_index)

    with open(ref_file, "w") as f:
        f.write(reference)
    with open(decoded_file, "w") as f:
        f.write(decoded_abstract)
    # print("Wrote example %i to file" % ex_index) 
Example 70
Project: PRSummarizer   Author: Tbabm   File: utils.py    MIT License
def ext_art_sent_tokenize(text):
    art = ext_art_preprocess(text)
    art_sents = sent_tokenize(art)
    return art_sents 
Example 71
Project: PRSummarizer   Author: Tbabm   File: utils.py    MIT License 5 votes vote down vote up
def ext_abs_sent_tokenize(text):
    return sent_tokenize(text) 
Example 72
Project: bookish-invention   Author: Shashankjain12   File: main.py    GNU General Public License v3.0
def pdfnotes(self):
        if self.p[1]==".pdf":
            pdfFileObject = open('/home/shashank/Downloads/'+self.file_name, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObject)

            count = pdfReader.numPages
            sentence=[]
            word_tags=[]
            for i in range(count):
                page = pdfReader.getPage(i)
                sentence.append(page.extractText().split('\n'))
                sentences=nltk.sent_tokenize(page.extractText())
                for j in range(len(sentences)):
                    sentences[j]=re.sub(r"[Ò¥@#$%^&|?!':\n\"//]"," ",sentences[j])
                    words=nltk.word_tokenize(sentences[j])
                    newwords=[self.lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
                    sentences[j]=' '.join(newwords)
                #print(sentences)

                paragraph="\n".join(sentences)
                #translation from english to any other language
                if self.k=='yes' or self.k=='y': 
                    translation=self.translator.translate(paragraph)
                    print(translation)
                else:
                    print(paragraph)
                words=nltk.word_tokenize(paragraph)
                tagged_words=nltk.pos_tag(words)
                namedEnt=nltk.ne_chunk(tagged_words)
                print("page "+str(i)+":")
                namedEnt.draw() 
Example 73
Project: mipsqa   Author: google   File: squad_prepro.py    Apache License 2.0
def split(example, para2sents_fn=None, positive_augment_factor=0):
  """Splits context in example into sentences and create multiple examples.

  Args:
    example: `dict` object, each element of `get_examples()`.
    para2sents_fn: function that maps `str` to a list of `str`, splitting
      paragraph into sentences.
    positive_augment_factor: Multiply positive examples by this factor.
      For handling class imbalance problem.
  Returns:
    a list of examples, with modified fields: `id`, `context`, `answers` and
    `answer_starts`. Also adds a `num_answers` field.
  """
  if para2sents_fn is None:
    para2sents_fn = nltk.sent_tokenize
  sents = para2sents_fn(example['context'])
  sent_start_idxs = _tokens2idxs(example['context'], sents)

  context = example['context']
  examples = []
  for i, (sent, sent_start_idx) in enumerate(zip(sents, sent_start_idxs)):
    sent_end_idx = sent_start_idx + len(sent)
    e = dict(example.items())  # Copying dict content.
    e['context'] = sent
    e['id'] = '%s %d' % (e['id'], i)
    e['answers'] = []
    e['answer_starts'] = []
    for answer, answer_start in zip(example['answers'],
                                    example['answer_starts']):
      answer_end = answer_start + len(answer)
      if (sent_start_idx <= answer_start < sent_end_idx or
          sent_start_idx < answer_end <= sent_end_idx):
        new_answer = context[max(sent_start_idx, answer_start):min(
            sent_end_idx, answer_end)]
        new_answer_start = max(answer_start, sent_start_idx) - sent_start_idx
        e['answers'].append(new_answer)
        e['answer_starts'].append(new_answer_start)
    if not e['answers']:
      e['answers'].append(NO_ANSWER)
      e['answer_starts'].append(-1)
    e['num_answers'] = len(e['answers'])
    # If the list is empty, then the example has no answer.
    examples.append(e)
    if positive_augment_factor and e['answers'][0] != NO_ANSWER:
      for _ in range(positive_augment_factor):
        examples.append(e)
  return examples 
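_tokens2idxs and NO_ANSWER come from the surrounding module and are not reproduced here. The splitting logic only needs the character offset of each sentence inside the original context, so a plausible stand-in (an assumption, not the project's code) could look like:

NO_ANSWER = ''  # hypothetical sentinel; the project defines its own value

def _tokens2idxs(context, sents):
  # Find each sentence's start offset in the original context, scanning
  # left to right so repeated sentences map to the correct occurrence.
  idxs = []
  start = 0
  for sent in sents:
    start = context.index(sent, start)
    idxs.append(start)
    start += len(sent)
  return idxs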
Example 74
Project: EliIE   Author: Tian312   File: NamedEntityRecognition.py    MIT License 4 votes vote down vote up
def txt2matrix_fortrain(ann_dir,mytrain,tag_included,filename,curpath):
    txt_files=readfromdir.get_file_list(ann_dir,['txt'])
    print "there's "+ str(len(txt_files))+" in total!"

    i=0
    for txt_file in txt_files:
        i+=1

        # read files

        myraw=codecs.open(txt_file).read()
        match=re.search('^(.*)\.txt',txt_file)
        name=match.group(1)
        ann_file=name+'_new.ann'
        print "reading file from",txt_file,ann_file,"..."
        myann=codecs.open(ann_file,"r")
        #print myann
        # output features
        text_tagged=labeling.ann_tagging(myann,myraw,tag_included)
        lines=" ".join(text_tagged.split(r'[;\n]'))
        sents=nltk.sent_tokenize(lines)
        lines=" ### ".join(sents)
        term_list, tag_list,index_list=t2c.txt2conll(lines,1)  # "1" here means training text with annotation; "0" means raw text
        sents=" ".join(term_list).split("###")
        type_list=[]
        pos_list=[]
        # extract umls concepts:
        j=0
        for sent in sents:
            if j>=len(term_list):
                break

            metamap_output=umls_identify.formating_for_metamap(curpath,sent,filename)
            one_sent_term,type_list=umls_identify.label_umls_cui(metamap_output,sent)
            pos_list=POS.pos_tagging(one_sent_term)
            pos_list.append(".")
            type_list.append("O")
            terms=sent.split()
            sent_id=0

            for t in terms:
                if term_list[j]== "###":
                    j=j+1
                term=term_list[j]
                lemma=st.stem(term)
                #vector=word2vec.ouput_embedding(model,term.lower(),50)
                bc=BrownClustering.bc_indexing(term.lower(),bc_index)
                print>> mytrain, term_list[j]+"\t"+lemma+"\t"+pos_list[sent_id]+"\t"+type_list[sent_id]+"\t"+bc+"\t"+index_list[j]+"\t"+tag_list[j]
                sent_id+=1
                j=j+1

            print>>mytrain

    if i%5==0:
        print str(i) +" files finished"

#txt2matrix_fortrain("training","Tempfile/relation.matrix",['Observation','Condition','Drug','Procedure_Device'],) 
Example 75
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: utils.py    BSD 2-Clause "Simplified" License 4 votes vote down vote up
def get_isf_idf_dict(vocab):
    isf_fn = os.path.join(PREPROC_DATA_DIR,"newsqa","isf_score_dict")
    idf_fn = os.path.join(PREPROC_DATA_DIR,"newsqa","idf_score_dict")
    if os.path.exists(isf_fn + ".pickle") and os.path.exists(idf_fn + ".pickle"):
        isf_dict = uploadObject(isf_fn)
        idf_dict = uploadObject(idf_fn)
        return isf_dict,idf_dict

    data_dict = pd.read_csv(os.path.join(DATA_CSV_DIR,'train.csv'),encoding='utf-8')

    sid_set = set()
    total_counts = nltk.FreqDist()
    total_counts_doc = nltk.FreqDist()
    nsents = 0
    ndocs = 0
    count = 0
    for sid,content in zip(data_dict["story_id"], data_dict["story_text"]):
        if sid in sid_set:
            continue
        sid_set.add(sid)
        sents = nltk.sent_tokenize(content)
        sents = [sent.split() for sent in sents] # sentences are already tokenized
        nsents += len(sents)
        ndocs += 1
        ref_sents = words_to_id(sents,vocab)
        doc_set = set()
        for sent in ref_sents:
            total_counts.update(set(sent))
            doc_set.update(sent)
        total_counts_doc.update(doc_set)
        if count%10000 == 0:
            print("-->isf_dict_count:",count)
        count +=1
    isf_dict = {}
    idf_dict = {}
    for wid,freq in total_counts.items():
        isf_dict[wid] = isf_score(nsents,freq)
    for wid,freq in total_counts_doc.items():
        idf_dict[wid] = isf_score(ndocs,freq)
    saveObject(isf_dict,isf_fn)
    saveObject(idf_dict,idf_fn)
    return isf_dict,idf_dict 
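isf_score, words_to_id, uploadObject and saveObject are utilities from the same module and are not shown. The weighting is conventionally a smoothed inverse log frequency; the sketch below is only an assumption about its shape, not the project's definition:

import math

def isf_score(n_units, freq):
    # Assumed inverse-(sentence/document)-frequency: log of the total number of
    # units over the number of units containing the word, with +1 smoothing.
    return math.log(n_units / (1.0 + freq))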
Example 76
Project: MSMARCO   Author: spacemanidol   File: text_input.py    MIT License 4 votes vote down vote up
def rich_tokenize(text, vocab, c_vocab, update):
    tokens = list(
        itertools.chain.from_iterable(
            (token.replace("''", '"').replace("``", '"')
             for token in word_tokenize(sent))
            for sent in sent_tokenize(text)))
    length = len(tokens)
    mapping = np.zeros((length, 2), dtype='int32')
    c_lengths = np.zeros(length, dtype='int32')
    start = 0
    for ind, token in enumerate(tokens):
        _start = text.find(token, start)
        t_l = len(token)
        if _start < 0 and token[0] == '"':
            t_l = 2
            _a = text.find("''"+token[1:], start)
            _b = text.find("``"+token[1:], start)
            if _a != -1 and _b != -1:
                _start = min(_a, _b)
            elif _a != -1:
                _start = _a
            else:
                _start = _b
        start = _start
        assert start >= 0
        mapping[ind, 0] = start
        mapping[ind, 1] = start + t_l
        c_lengths[ind] = t_l
        start = start + t_l

    if update:
        character_ids = [
            [c_vocab.setdefault(c, len(c_vocab)) for c in token]
            for token in tokens]
        token_ids = [
            vocab.setdefault(token, len(vocab)) for token in tokens]
    else:
        character_ids = [
            [c_vocab.get(c, 1) for c in token]
            for token in tokens]
        token_ids = [
            vocab.get(token, 1) for token in tokens]

    return token_ids, character_ids, length, c_lengths, mapping 
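A short usage sketch to make the return values concrete; it assumes, as the function itself does, that word_tokenize and sent_tokenize are imported from nltk and numpy is available as np:

vocab = {'<pad>': 0, '<unk>': 1}
c_vocab = {'<pad>': 0, '<unk>': 1}
text = 'Marco Polo was born in Venice. He was a merchant.'
token_ids, char_ids, length, c_lengths, mapping = rich_tokenize(
    text, vocab, c_vocab, update=True)

print(length)                              # total number of tokens across sentences
print(mapping[0])                          # [start, end) character span of the first token
print(text[mapping[0][0]:mapping[0][1]])   # 'Marco'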
Example 77
Project: TheSuperQuestionTypeTopicClassifier   Author: AmirAhmadHabibi   File: eurlex_data_maker.py    GNU General Public License v3.0 4 votes vote down vote up
def lemmatise_all():
    id_mappings = pd.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')

    lemmatiser = WordNetLemmatizer()
    stop_words = set()
    for w in stopwords.words('english'):
        stop_words.add(w)
    cleaner = re.compile(r'^\s*-*|-\s*$')

    prog = Progresser(id_mappings.shape[0])

    for i, row in id_mappings.iterrows():
        prog.count()
        try:
            # if file already processed then continue
            if os.path.isfile('./EurLex_data/lem_txt/' + str(row['DocID']) + '-lem.txt'):
                continue

            try:
                with open('./EurLex_data/eurlex_txt/' + str(row['DocID']) + '.txt', 'r', encoding="utf8") as infile:
                    raw_text = infile.read()
            except:
                continue

            lemmatised_doc = ''

            # lemmatise each sentence
            for sent in sent_tokenize(raw_text):
                lemmatised_sent = ''
                tokens_pos = pos_tag(word_tokenize(sent))

                # lemmatise each word in sentence
                for word_pos in tokens_pos:
                    if len(word_pos[0]) < 2: continue

                    word = word_pos[0].lower()
                    word = re.sub(cleaner, '', word)
                    if word in stop_words: continue

                    if len(word) > 2:
                        word = lemmatiser.lemmatize(word=word, pos=get_wordnet_pos(word_pos[1]))
                        if word in stop_words: continue

                    lemmatised_sent += word + ' '
                lemmatised_doc += lemmatised_sent + '\n'
            # write doc to file
            with open('./EurLex_data/lem_txt/' + str(row['DocID']) + '-lem.txt', 'w', encoding="utf8") as outfile:
                outfile.write(lemmatised_doc)
        except Exception as e:
            print(e) 
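get_wordnet_pos and Progresser are defined elsewhere in the project. The POS mapping is the usual Penn-Treebank-to-WordNet conversion that WordNetLemmatizer expects; a common implementation (assumed here, the project's version may differ) is:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # Map Penn Treebank tags to the WordNet POS constants the lemmatizer accepts.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # default, matching WordNetLemmatizer's own default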
Example 78
Project: Automated-Template-Learning   Author: TallChris91   File: CardConverter.py    GNU General Public License v3.0 4 votes vote down vote up
def sentencefinder(textslist, player, surname, team, otherteam, playerdict, time):
    goalsentences = []
    if len(surname.split()) > 1:
        surname = surname.split()[-1]

    playerlist = []
    for playerid in playerdict:
        otherplayer = playerdict[playerid][1]
        if len(otherplayer.split()) > 1:
            otherplayer = otherplayer.split()[-1]
        if otherplayer != surname:
            playerlist.append(otherplayer)

    for text in textslist:
        relevantsentences = []
        sentences = nltk.sent_tokenize(text)
        for idx, sentence in enumerate(sentences):
            if idx <= 1:
                continue
            words = nltk.word_tokenize(sentence)
            for idx, word in enumerate(words):
                words[idx] = word.lower()

            sentence = ' '.join(words)
            otherplayers = 'n'
            if (re.search(r'\b' + re.escape(surname) + r'\b', sentence, re.IGNORECASE)) and (re.search(r'(\bkaart\b)|(\bprent\b)|(\bgeel\b)|(\bgele\b)|(\brood\b)|(\brode\b)', sentence, re.IGNORECASE)) and not (re.search(r'\d+\-\d+', sentence, re.IGNORECASE)): #Cards
                for othersurname in playerlist:
                    if re.search(re.escape(othersurname), sentence, re.IGNORECASE):
                        otherplayers = 'y'
                        break

                if (otherplayers == 'n') and (len(sentence.split()) <= 30) and (len(sentence.split()) >= 5):
                    relevantsentences.append(sentence)
                #sentence = sentence.replace(player.lower(), '<goal_scorer>')
                #sentence = sentence.replace(surname.lower(), '<goal_scorer>')
                #sentence = sentence.replace(team.lower(), '<team>')
                #sentence = sentence.replace(score, '<score>')
                #sentence = re.sub(r'\d+\-\d+', '<score>', sentence)
                #sentence = sentence.replace(otherteam.lower(), '<other_team>')

        goalsentences.extend(relevantsentences)

    # Remove sentences that are the same
    goalsentences = remove_duplicates(goalsentences)
    return goalsentences 
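The match reports searched here are Dutch ('kaart', 'geel', 'rood'), while nltk.sent_tokenize defaults to the English Punkt model. Passing the language argument may segment such texts more reliably; a small sketch:

import nltk

text = "De wedstrijd eindigde in 2-1. Daarna kreeg hij een gele kaart."
sentences = nltk.sent_tokenize(text, language='dutch')
print(sentences)
# typically: ['De wedstrijd eindigde in 2-1.', 'Daarna kreeg hij een gele kaart.']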
Example 79
Project: MSMARCO-Question-Answering   Author: microsoft   File: text_input.py    MIT License 4 votes vote down vote up
def rich_tokenize(text, vocab, c_vocab, update):
    tokens = list(
        itertools.chain.from_iterable(
            (token.replace("''", '"').replace("``", '"')
             for token in word_tokenize(sent))
            for sent in sent_tokenize(text)))
    length = len(tokens)
    mapping = np.zeros((length, 2), dtype='int32')
    c_lengths = np.zeros(length, dtype='int32')
    start = 0
    for ind, token in enumerate(tokens):
        _start = text.find(token, start)
        t_l = len(token)
        if _start < 0 and token[0] == '"':
            t_l = 2
            _a = text.find("''"+token[1:], start)
            _b = text.find("``"+token[1:], start)
            if _a != -1 and _b != -1:
                _start = min(_a, _b)
            elif _a != -1:
                _start = _a
            else:
                _start = _b
        start = _start
        assert start >= 0
        mapping[ind, 0] = start
        mapping[ind, 1] = start + t_l
        c_lengths[ind] = t_l
        start = start + t_l

    if update:
        character_ids = [
            [c_vocab.setdefault(c, len(c_vocab)) for c in token]
            for token in tokens]
        token_ids = [
            vocab.setdefault(token, len(vocab)) for token in tokens]
    else:
        character_ids = [
            [c_vocab.get(c, 1) for c in token]
            for token in tokens]
        token_ids = [
            vocab.get(token, 1) for token in tokens]

    return token_ids, character_ids, length, c_lengths, mapping 
Example 80
Project: bookish-invention   Author: Shashankjain12   File: main.py    GNU General Public License v3.0 4 votes vote down vote up
def pngnotes(self):
        
        if self.p[1]=='.png':
            
            img=cv2.imread(self.file_name)
            a=pytesseract.image_to_string(img)
            cv2.imshow('image',img)
            cv2.waitKey()
            #b=a.split()
            #print(b)
            sentences=nltk.sent_tokenize(a)
            word_tags=[]
            
            for i in range(len(sentences)):
                sentences[i]=re.sub(r"[@#$%^&|?!'\"]"," ",sentences[i])
                words=nltk.word_tokenize(sentences[i])
                newwords=[self.lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
                sentences[i]=' '.join(newwords)
                """
                    tagged_words=nltk.pos_tag(newwords)
                for tw in tagged_words:
                    word_tags.append(tw[0]+" "+tw[1])
                tagged_par=" ".join(word_tags)
            namedEnt=nltk.ne_chunk(tagged_words)
            print(namedEnt)
            namedEnt.draw()
            print(tagged_par)
                """
    
            print(sentences) 
            paragraph="\n".join(sentences)
            if self.k=='yes' or self.k=='y':
                translation=self.translator.translate(paragraph)
                print(translation)
            else:
                print(paragraph)
            words=nltk.word_tokenize(paragraph)
            tagged_words=nltk.pos_tag(words)
            namedEnt=nltk.ne_chunk(tagged_words)
            #for i in range(len(namedEnt)):
            #       print(namedEnt[i][1])
            #       print(namedEnt[i][1][i] 
            namedEnt.draw()
            #print(paragraph)