Python nltk.tokenize.sent_tokenize() Examples

The following are 30 code examples of nltk.tokenize.sent_tokenize(), drawn from open-source projects; the source file, project, and license are noted above each example. You may also want to check out the other functions and classes available in the nltk.tokenize module.
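For reference, here is a minimal self-contained usage sketch (it assumes a standard NLTK installation; the pre-trained Punkt sentence model must be downloaded once):

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# sent_tokenize relies on the pre-trained Punkt model; download it once per environment.
nltk.download('punkt')

text = "NLTK makes sentence splitting easy. It also handles word tokenization."
sentences = sent_tokenize(text)
# ['NLTK makes sentence splitting easy.', 'It also handles word tokenization.']
words = [word_tokenize(s) for s in sentences]
print(sentences)
print(words)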
Example #1
Source File: DataReader.py    From MachineLearningSamples-BiomedicalEntityExtraction with MIT License
def get_feature_vectors_1 (self, data_list):        

        print("Reading unlabeled data from dataframe")   
        # list of list of tokens
        all_sentences_words = []           

        # Process all lines in the file
        for line in data_list:
            text = line.strip()                                

            #break the input text into sentences before tokenization
            sentences = sent_tokenize(text)
            
            for sent in sentences:
                sentence_words = nltk.word_tokenize(sent)                             
                all_sentences_words.append( tuple(sentence_words) )                                                                                             
        
        self.n_sentences_all = len(all_sentences_words)        
        print("number of unlabeled examples = {}".format(self.n_sentences_all))
        return self.create_feature_vectors(all_sentences_words)

    ################################################## 
    #   create_feature_vectors
    ################################################## 
Example #2
Source File: do_sentence_segmentation.py    From training with Apache License 2.0
def process_one_file(one_input):
  """Separate paragraphs into sentences, for one file."""
  input_filename = one_input + args.input_suffix
  output_filename = one_input + args.output_suffix
  logging.info('Processing %s => %s', input_filename, output_filename)
  with io.open(input_filename, 'r', encoding='utf-8') as fin:
    with io.open(output_filename, 'w', encoding='utf-8') as fout:
      for line in fin:
        if len(line) == 1:
          fout.write(u'\n')
        sents = sent_tokenize(line)
        for sent in sents:
          sent_str = sent.strip()
          # if sent_str:
          fout.write('%s\n' % sent_str)
      fout.write(u'\n') 
Example #3
Source File: doc2vec.py    From broca with MIT License
def _doc2vec_doc_stream(paths, n, tokenizer=word_tokenize, sentences=True):
    """
    Generator to feed sentences to the dov2vec model.
    """
    i = 0
    p = Progress()
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                i += 1
                p.print_progress(i/n)

                # We do minimal pre-processing here so the model can learn
                # punctuation
                line = line.lower()

                if sentences:
                    for sent in sent_tokenize(line):
                        tokens = tokenizer(sent)
                        yield LabeledSentence(tokens, ['SENT_{}'.format(i)])
                else:
                    tokens = tokenizer(line)
                    yield LabeledSentence(tokens, ['SENT_{}'.format(i)]) 
Example #4
Source File: batcher.py    From docker with MIT License
def fill_example_queue(self, data_path, mode = "test"):

        new_queue =[]

        filelist = glob.glob(data_path)  # get the list of datafiles
        assert filelist, ('Error: Empty filelist at %s' % data_path)  # check filelist isn't empty
        filelist = sorted(filelist)
        if mode == "train":
            filelist = filelist

        for f in filelist:


            reader = codecs.open(f, 'r', 'utf-8')
            while True:
                string_ = reader.readline()
                if not string_: break
                dict_example = json.loads(string_)
                review = dict_example["review"]
                if(len(sent_tokenize(review))<2):
                    continue
                example = Example(review, self._vocab, self._hps)
                new_queue.append(example)
        return new_queue 
Example #5
Source File: prepare_clc_fce_data.py    From NLP_Toolkit with Apache License 2.0
def main():
    fce = convert_fce(args.fce_dataset_path)
    with open(args.output + "/fce-original.txt", 'w', encoding='utf-8') as out_original, \
            open(args.output + "/fce-applied.txt", 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    if re.search(r"[{>][.?!]$", tokenized_sents[i]):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    regexp = r'{([^{}]*?)=>([^{}]*?)}'
                    original = re.sub(regexp, r"\1", tokenized_sents[i])
                    applied = re.sub(regexp, r"\2", tokenized_sents[i])
                    # filter out nested alerts
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n") 
Example #6
Source File: corpus_cleaner.py    From acl2017-interactive_summarizer with Apache License 2.0
def parse_xml_all(self, data_file, doc_type, language='english'):
        e = ET.parse(data_file)
        cluster_data = {}
        root = e.getroot()
        for topics in root:
            data = []
            topic_id = topics.attrib.get('id')
            for documents in topics.findall(doc_type):
                doc_id = documents.attrib.get('id')
                if doc_type == 'document':
                    title_text = documents.find('title').text
                doc_text = documents.find('text').text
                text = text_normalization(doc_text)
                doc_sents = sent_tokenize(text, language)
                data.append([doc_id, doc_sents])
            cluster_data[topic_id] = data
        return cluster_data 
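Note that sent_tokenize() accepts a language argument, as used above, which selects the corresponding pre-trained Punkt model. A minimal sketch (assuming the Punkt data for that language is installed):

from nltk.tokenize import sent_tokenize

german_text = "Das ist der erste Satz. Hier kommt der zweite."
print(sent_tokenize(german_text, language='german'))
# ['Das ist der erste Satz.', 'Hier kommt der zweite.']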
Example #7
Source File: summarizer.py    From delbot with GNU Affero General Public License v3.0
def summarize(self, text, n):
        """
          Return a list of n sentences
          which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx] 
Example #8
Source File: kaggle18.py    From modin with Apache License 2.0
def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try:
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        text = regex.sub(" ", text)  # remove punctuation
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search("[a-zA-Z]", w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w) >= 3]
        return filtered_tokens
    except TypeError as e:
        print(text, e) 
Example #9
Source File: word2vec.py    From Bidirectiona-LSTM-for-text-summarization- with MIT License
def createCorpus(t):
    # st and wt are this module's import aliases for sent_tokenize and word_tokenize
    corpus = []
    all_sent = []
    for k in t:
        for p in t[k]:
            corpus.append(st(p))
    for sent in range(len(corpus)):
        for k in corpus[sent]:
            all_sent.append(k)
    for m in range(len(all_sent)):
        all_sent[m] = wt(all_sent[m])
    
    all_words=[]
    for sent in all_sent:
        hold=[]
        for word in sent:
            hold.append(word.lower())
        all_words.append(hold)
    return all_words 
Example #10
Source File: Word_Frequency_Summarization.py    From nlp-akash with MIT License
def run_summarization(text):
    # 1 Create the word frequency table
    freq_table = _create_frequency_table(text)

    '''
    We already have a sentence tokenizer, so we just need 
    to run the sent_tokenize() method to create the array of sentences.
    '''

    # 2 Tokenize the sentences
    sentences = sent_tokenize(text)

    # 3 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(sentences, freq_table)

    # 4 Find the threshold
    threshold = _find_average_score(sentence_scores)

    # 5 Important Algorithm: Generate the summary
    summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)

    return summary 
Example #11
Source File: DataReader.py    From MachineLearningSamples-BiomedicalEntityExtraction with MIT License
def get_feature_vectors_2 (self, data_file):        

        print("Loading unlabeled data from file {}".format(data_file))
        with open(data_file, 'r') as f_data:                                    
            all_sentences_words = []
                 

            # Process all lines in the file
            for line in f_data:
                text = line.strip()                                

                #break the input text into sentences before tokenization
                sentences = sent_tokenize(text)
                
                for sent in sentences:
                    sentence_words = nltk.word_tokenize(sent)                             
                    all_sentences_words.append( tuple(sentence_words) )                                                           
        
        self.n_sentences_all = len(all_sentences_words)   
        print("number of unlabeled examples = {}".format(self.n_sentences_all))
        return self.create_feature_vectors(all_sentences_words)

    ##################################################
    #  get_feature_vectors_1  
    ################################################## 
Example #12
Source File: labled_tsv_to_tfrecords_single_sentences.py    From bran with Apache License 2.0
def convert_to_single_sentence(doc_str, e1_start, e1_end, e2_start, e2_end, annotation_map):
    offsets = zip(e1_start+e2_start, e1_end+e2_end, [1]*len(e1_start)+[2]*len(e2_start))
    offsets = sorted(offsets, key=lambda tup: tup[0])
    replaced_doc_str = [process_single_annotation(doc_str, 0, s, e, annotation_map, i, ent_id) if i == 0
                        else
                        process_single_annotation(doc_str, offsets[i-1][1], s, e, annotation_map, i, ent_id)
                        for i, (s, e, ent_id) in enumerate(offsets)]

    replaced_doc_str.append(' '.join(doc_str[offsets[-1][1]:]))
    new_doc_str = ''.join(replaced_doc_str)

    ## TODO only works for data with single e1 and e2 mention
    sentences = sent_tokenize(new_doc_str.replace('@@ ', '').decode('utf-8'))
    tokenized_sents = [tokenize(s) for s in sentences]
    chosen_sent = [i for i, s in enumerate(sentences) if s.count(ENTITY_STRING) >= 2]
    if chosen_sent:
        if FLAGS.full_abstract:
            replaced_sent = [annotation_map[w] if w in annotation_map else w for s in tokenized_sents for w in s]
        else:
            idx = chosen_sent[0]
            s_idx = max(0, idx - FLAGS.sentence_window)
            e_idx = min(idx + FLAGS.sentence_window+1, len(tokenized_sents))
            window_sentences = [tokenized_sents[i] for i in (range(s_idx, e_idx))]
            replaced_sent = [annotation_map[w] if w in annotation_map else w for s in window_sentences for w in s]
        return replaced_sent 
Example #13
Source File: extras.py    From semeval2017-scienceie with Apache License 2.0
def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        scaped_tok = re.escape(tok)
        m = re.search(scaped_tok, tail)
        start, end = m.span()
        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the rest
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens 
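For illustration (not part of the original project), calling the function above pairs each token with its character span in the input string; the output shown assumes NLTK's default sentence and word tokenizers:

print(offset_tokenize("Go home. Now."))
# [('Go', (0, 2)), ('home', (3, 7)), ('.', (7, 8)), ('Now', (9, 12)), ('.', (12, 13))]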
Example #14
Source File: NewsArticleClass.py    From Python-Scripts-Repo-on-Data-Science with GNU General Public License v3.0
def summarize(self, article, n):
        # article is a (text, title) tuple; build the summary from the body text
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sentences_index = nlargest(n, ranking, key=ranking.get)
        return [sentences[j] for j in sentences_index]

##############################################################################
# TEST 
Example #15
Source File: util.py    From camr with GNU General Public License v2.0
def find_abr_fullname(doc,query,Num):
    """Find the query(abbreviation's) full name within the document.
       Parameters:
       doc: the document to be searched for(specified format) 
       query: the abbreviation
       Num: the number of sentences before the query to be looked for fullname
       (here we asume that all the fullname of the query appeared before the query)
    """
    sents = [word_tokenize(t) for t in sent_tokenize(doc)]
    for i,sent in enumerate(sents):
        if query in sent:
            fullname = find_abr_fn(sent,query)
            if fullname != -1:
                return fullname
            else:
                j = 1
                while i-j >= 0 and j <= Num: 
                    # look in the preceding sentences, not in the current sentence's words
                    if find_abr_fn(sents[i-j], query) == -1:
                        j += 1
                    else:
                        return find_abr_fn(sents[i-j], query)
                
    raise Exception('No query in the document.') 
Example #16
Source File: NewsArticleClass.py    From Python-Scripts-Repo-on-Data-Science with GNU General Public License v3.0
def extractFeatures(self, article, n, customStopWords=None):
        # pass in article as a tuple ( text, title)
        text = article[0]
        # extract the text
        title = article[1]
        # extract the title
        sentences = sent_tokenize(text)
        # split text into sentences
        word_sent = [word_tokenize(a.lower()) for a in sentences]
        # split sentences into words
        self._freq = self._compute_frequencies(word_sent, customStopWords)
        # calculate word freq using member func created above
        if n < 0:
            # how many features (words) to return - a negative number means
            # no feature (word) selection, just return all features
            return nlargest(len(self._freq.keys()),
                            self._freq, key=self._freq.get)
        else:
            # if the calling function has asked for a subset,
            # return only the 'n' largest features, i.e. the
            # most important words (important == frequent, excluding stopwords)
            return nlargest(n, self._freq, key=self._freq.get) 
Example #17
Source File: background.py    From language with Apache License 2.0
def score_sentences(query,
                    doc_json,
                    entity,
                    sentence_scores,
                    max_sentence_len,
                    n=3):
  """Score sentences with respect to the query."""
  sentences = tokenize.sent_tokenize(doc_json['text'])
  query_ngrams = util.get_ngrams(tokenize.word_tokenize(query), n)
  for sentence in sentences:
    sentence_tokens = tokenize.word_tokenize(sentence)
    tokens = tokenize.word_tokenize(
        entity['wikipedia_name']) + [':'] + sentence_tokens[:max_sentence_len]
    sentence_ngrams = util.get_ngrams(tokens, n)
    score = len(set(sentence_ngrams).intersection(query_ngrams)) / max(
        1, len(query_ngrams))
    sentence_scores.append(((entity, sentence_tokens), score))
Example #18
Source File: create_pretraining_data.py    From language with Apache License 2.0
def split_into_sentences(text, doc_annotations, tokenizer):
  """Split into sentences and return bookkeeping info."""
  sentences = []
  sentences_starts = []
  sentence_annotations = []
  doc_annotations = sorted(doc_annotations, key=lambda x: x[2])
  annotation_idx = 0
  sentences_text = tokenize.sent_tokenize(text)
  token_idx = 0
  for sentence_text in sentences_text:
    sub_tokens, word_starts = tokenizer.tokenize(sentence_text)
    sentences.append(sub_tokens)
    sentences_starts.append(word_starts)
    sentence_annotations.append([])
    token_idx += len(sentence_text.split(" "))
    while annotation_idx < len(
        doc_annotations) and doc_annotations[annotation_idx][2] < token_idx:
      sentence_annotations[-1].append(doc_annotations[annotation_idx])
      annotation_idx += 1
  return sentences, sentences_starts, sentence_annotations 
Example #19
Source File: preprocess.py    From serapis with MIT License
def paragraph_to_sentences(paragraph, term):
    """
    Turns a paragraph into clean, preprocessed sentences
    """
    result = []
    paragraph = re.sub(r"([^ ])([\(\[\"])", r"\1 \2", paragraph)  # Give brackets space to breathe
    paragraph = re.sub(r"([\)\]\"\!\?:])([^ ])", r"\1 \2", paragraph)
    paragraph = re.sub(r"([^. ]{3})\.([^. ]{3}|A |An )", r"\1. \2", paragraph)
    paragraph = re.sub(r" e\.?g\.? ", " _eg_ ", paragraph)  # sent_tokenize improperly splits sentences here
    paragraph = re.sub(r" i\.?e\.? ", " _ie_ ", paragraph)
    sentences = sent_tokenize(paragraph)
    for sentence in sentences:
        sentence = sentence.replace("_eg_", "_e.g._").replace("_ie_", "i.e.")  # reverts edge case
        processed = preprocess_sentence(sentence, term)
        if qualify_sentence(processed):
            result.append(processed)
    return result


# Sentences
######################## 
Example #20
Source File: text_summarizer.py    From nlp_url_summarizer with MIT License
def summarize(self, text, n):
    """
      Return a list of n sentences 
      which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i,sent in enumerate(word_sent):
      for w in sent:
        if w in self._freq:
          ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)    
    return [sents[j] for j in sents_idx] 
Example #21
Source File: textsum_data_convert.py    From TextSum with MIT License
def _convert_files_to_binary(input_filenames, output_filename):
  with open(output_filename, 'wb') as writer:
    for filename in input_filenames:
      with open(filename, 'r') as f:
        document = f.read()
    
      document_parts = document.split('\n', 1)
      assert len(document_parts) == 2
    
      title = '<d><p><s>' + document_parts[0] + '</s></p></d>'
      
      body = document_parts[1].decode('utf8').replace('\n', ' ').replace('\t', ' ')  # Python 2-style decode of the byte string read from the file
      sentences = sent_tokenize(body)
      body = '<d><p>' + ' '.join(['<s>' + sentence + '</s>' for sentence in sentences]) + '</p></d>'
      body = body.encode('utf8')
    
      tf_example = example_pb2.Example()
      tf_example.features.feature['article'].bytes_list.value.extend([body])
      tf_example.features.feature['abstract'].bytes_list.value.extend([title])
      tf_example_str = tf_example.SerializeToString()
      str_len = len(tf_example_str)
      writer.write(struct.pack('q', str_len))
      writer.write(struct.pack('%ds' % str_len, tf_example_str)) 
Example #22
Source File: vocabulary.py    From topicModelling with GNU General Public License v3.0
def doc_to_ids(self, doc, training=True):
        l = []
        words = dict()
        window = 150
#        doc = doc.replace("&ndash;", " ")
#        doc = sent_tokenize(doc)
        for sentence in doc:
            miniArray = []
            for term in sentence:
                id = self.term_to_id(term, training)    
                if id != None:
                    miniArray.append(id)
                    if not id in words:
                        words[id] = 1
                        self.docfreq[id] += 1
            if not len(miniArray):
                continue
            if len(miniArray)  > window:
                l.extend([np.array(miniArray[i:i+window]) for i in xrange(0, len(miniArray), window)])
            else:
                l.append(np.array(miniArray))
        return l 
Example #23
Source File: Article.py    From find-all-the-new-words with MIT License
def out_put_important_sentences(self):
        pp_m_article = re.sub(r"\n",r".\n",self.marked_article)
        sentences = tokenize.sent_tokenize(pp_m_article)
        i_sentences = [_ if self.pattern.search(_) else None for _ in sentences]
        write_important_sentances_to_file("./others/",self.name, "\n\n".join(list(filter(None,i_sentences)))) 
Example #24
Source File: main.py    From Python-DevOps with MIT License
def tokenizer(string):
    return [word_tokenize(t) for t in sent_tokenize(string)]
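An illustrative call (not part of the original source):

print(tokenizer("Hello world. How are you?"))
# [['Hello', 'world', '.'], ['How', 'are', 'you', '?']]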
Example #25
Source File: vocabulary_sentenceLayer.py    From topicModelling with GNU General Public License v3.0
def doc_to_ids(self, doc, training=True):
        l = []
        words = dict()
        doc_sents = sent_tokenize(doc)
        for sentence in doc_sents:
            miniArray = []
            for term in sentence.split():
                id = self.term_to_id(term, training)
                if id != None:
                    miniArray.append(id)
                    if not id in words:
                        words[id] = 1
                        self.docfreq[id] += 1 # It counts in how many documents a word appears. If it appears in only a few, remove it from the vocabulary using cut_low_freq()
            l.append(np.array(miniArray, dtype=np.int32))
        return l 
Example #26
Source File: preprocess.py    From BERT-pytorch with The Unlicense
def detect_sentences(raw_documents_path, sentences_detected_path, **_):
    with open(raw_documents_path) as raw_documents_file, open(sentences_detected_path, 'w') as sentences_detected_file:
        for line in tqdm(raw_documents_file):
            sentences = sent_tokenize(line.strip())
            tokenized_sentences = []
            for sentence in sentences:
                sentence = sentence.lower()
                sentence = NUMBERS.sub('N', sentence)
                tokens = [match.group() for match in TOKENIZATION.finditer(sentence)]
                if not tokens:
                    continue
                tokenized_sentences.append(' '.join(tokens))

            output_line = '|'.join(tokenized_sentences) + '\n'
            sentences_detected_file.write(output_line) 
Example #27
Source File: raw_books_preproc_pipeline.py    From language with Apache License 2.0
def split_line_by_sentences(line):
  return sent_tokenize(line) 
Example #28
Source File: books_preproc_pipeline.py    From language with Apache License 2.0
def split_line_by_sentences(line):
  return sent_tokenize(line) 
Example #29
Source File: summarize.py    From Django-Bookworm with MIT License
def __init__(self, text):
        self.__text        = text
        self.__stop_words  = stopwords.words('english')
        self.__sentence    = sent_tokenize(text)
        self.__f_text      = self.create_formatted_text()
        self.__word_freq   = self.calc_word_frequencies() 
Example #30
Source File: phrases.py    From broca with MIT License
def _phrase_doc_stream(paths, n, tokenizer=word_tokenize):
    """
    Generator to feed sentences to the phrase model.
    """
    i = 0
    p = Progress()
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                i += 1
                p.print_progress(i/n)
                for sent in sent_tokenize(line.lower()):
                    tokens = tokenizer(sent)
                    yield tokens