Python nltk.ne_chunk() Examples

The following are code examples showing how to use nltk.ne_chunk(). They are drawn from open source Python projects.
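
Before the project excerpts, here is a minimal, self-contained sketch of the pipeline most of the examples below follow (the sentence text is illustrative only; the commented nltk.download calls fetch the models that ne_chunk and its upstream steps rely on):

import nltk

# One-time model downloads needed by the calls below:
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

sentence = "Mark Zuckerberg founded Facebook in Menlo Park."
tokens = nltk.word_tokenize(sentence)   # split the sentence into word tokens
tagged = nltk.pos_tag(tokens)           # attach part-of-speech tags
tree = nltk.ne_chunk(tagged)            # group tagged tokens into named-entity subtrees

# Named entities come back as nltk.Tree subtrees labeled PERSON, ORGANIZATION, GPE, ...
for chunk in tree:
    if hasattr(chunk, 'label'):
        print(chunk.label(), ' '.join(word for word, pos in chunk))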

Example 1
Project: controcurator   Author: ControCurator   File: summarization.py    MIT License
def extract_nltk(comment):
    body = comment['text']
    entities = {}
    sentences = nltk.sent_tokenize(body)
    print(sentences)
    for sentence in sentences:

        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        chunks = nltk.ne_chunk(tagged)
        for chunk in chunks:
            if type(chunk) is nltk.Tree:
                # join the chunk's leaves with spaces so multi-word entities stay readable
                t = ' '.join(c[0] for c in chunk.leaves())
                entities[t] = chunk.label()
    # print(entities)
    return entities 
Example 2
Project: controcurator   Author: ControCurator   File: test.py    MIT License
def processLanguage():
    try:
        for item in contentArray:
            tokenized = nltk.word_tokenize(item)
            tagged = nltk.pos_tag(tokenized)
            # print(tagged)

            namedEnt = nltk.ne_chunk(tagged)
            for s in namedEnt:
                if type(s) is nltk.Tree:
                    t = ' '.join(c[0] for c in s.leaves())
                    print(s.label(), t)
            time.sleep(1)

    except Exception as e:
        print(str(e)) 
Example 3
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: Anaphora.py    MIT License
def learnAnaphora(self):
        sentences = [
            "John is a man. He walks",
            "John and Mary are married. They have two kids",
            "In order for Ravi to be successful, he should follow John",
            "John met Mary in Barista. She asked him to order a Pizza"
        ]

        for sent in sentences:
            chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary=False)
            stack = []
            print(sent)
            items = tree2conlltags(chunks)
            for item in items:
                if item[1] == 'NNP' and (item[2] == 'B-PERSON' or item[2] == 'O'):
                    stack.append((item[0], self.gender(item[0])))
                elif item[1] == 'CC':
                    stack.append(item[0])
                elif item[1] == 'PRP':
                    stack.append(item[0])
            print("\t {}".format(stack)) 
Example 4
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: PipelineQ.py    MIT License
def extractNE():
    while True:
        if queues[1].empty():
            break
        else:
            data = queues[1].get()
            postags = data['input']
            queues[1].task_done()
            chunks = nltk.ne_chunk(postags, binary=False)
            print("  << {} : ".format(data['uuid']), end = '')
            for path in chunks:
                try:
                    # only NE subtrees have a label(); plain (word, tag) leaves raise AttributeError
                    label = path.label()
                    print(path, end=', ')
                except AttributeError:
                    pass
            print() 
Example 5
Project: qa_query   Author: AdamSpannbauer   File: analysis.py    MIT License
def ner_extract(text, ne_types=QA_NE_TYPES):
    """Remove non named entities from a string

    :param text: str to remove non named entities from
    :param ne_types: list/set of named entities to keep
    :return: text with non named entities removed
    """
    if ne_types is None:
        ne_types = ALL_NE_TYPES

    chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    ne_list = []
    for chunk in chunks:
        if hasattr(chunk, 'label'):
            if chunk.label() in ne_types:
                full_ne = ' '.join(c[0] for c in chunk)
                ne_list.append(full_ne)

    return ' '.join(ne_list) 
Example 6
Project: ml2015project   Author: sangheestyle   File: nlp.py    MIT License
def extract_entities(text, all=True, verbose=False):
    ne_list = []
    tokens = nltk.word_tokenize(text)
    for chunk in nltk.ne_chunk(nltk.pos_tag(tokens)):
        if all:
            if verbose: print(chunk)
        if type(chunk) is nltk.tree.Tree:
            first_word = ' '.join(c[0] for c in chunk.leaves()).split()[0]
            ne_tag = chunk.label()
            ne_list.append([ne_tag, first_word, tokens.index(first_word)])
            if verbose: print(ne_list[-1])
        elif chunk[1] == 'CD':
            first_word = chunk[0]
            ne_list.append(['CD', first_word, tokens.index(first_word)])
            if verbose: print(ne_list[-1])

    return ne_list 
Example 7
Project: youtube-sentiment-helper   Author: dillonmabry   File: utility.py    MIT License
def extract_entities(corpus):
    """
    Method to extract key entities from corpus of words
    Returns list of chunked key entities
    Args:
        corpus: the corpus of comments as a single string
    """
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(corpus)
    chunked = ne_chunk(pos_tag(words))
    cont_chunk = []
    curr_chunk = []
    for c in chunked:
        if type(c) == Tree:
            curr_chunk.append(" ".join([token for token, pos in c.leaves()]))
        elif curr_chunk:
            named_entity = " ".join(curr_chunk)
            if named_entity not in cont_chunk:
                cont_chunk.append(named_entity)
                curr_chunk = []
        else:
            continue
    if len(cont_chunk) > 0:
        return cont_chunk[:10] 
Example 8
Project: atap   Author: foxbook   File: ner.py    Apache License 2.0
def get_entities(self, document):
        """
        Extract entities from a single document using the
        nltk.ne_chunk method

        This method is called multiple times by the transform method

        :param document: a list of lists of tuples
        :return entities: a list of comma-separated strings
        """
        entities = []
        for paragraph in document:
            for sentence in paragraph:
                # the classifier chunks the sentence, adding category labels, e.g. PERSON
                trees = ne_chunk(sentence)
                # select only trees with the kinds of entities we want
                for tree in trees:
                    if hasattr(tree, 'label'):
                        if tree.label() in self.labels:
                            # entities is a list, each entry is a list of entities
                            # for a document
                            entities.append(
                                ' '.join([child[0].lower() for child in tree])
                                )
        return entities 
Example 9
Project: python-scripts   Author: Ventrosky   File: totd-project.py    GNU General Public License v3.0
def analisiLing(frasi):
    namedEntityDict = { "PERSON" : [], "GPE" : []}
    tokensPOStot=[]
    for frase in frasi:
        tokens = nltk.word_tokenize(frase)
        tokensPOS = nltk.pos_tag(tokens)
        analisi = nltk.ne_chunk(tokensPOS)
        for nodo in analisi: # walk the tree's top-level nodes
            NE = ''
            if hasattr(nodo, 'label'): # check whether this chunk is an NE subtree
                if nodo.label() in ["PERSON", "GPE"]: # proper names of people or places
                    for partNE in nodo.leaves(): # walk the leaves of the selected node
                        NE = NE+partNE[0]+' '
                    namedEntityDict[nodo.label()].append(NE)
        tokensPOStot = tokensPOStot + tokensPOS 
    return tokensPOStot, namedEntityDict 


# returns the bigrams
Example 10
Project: python-scripts   Author: Ventrosky   File: project2.py    GNU General Public License v3.0
def analisiLing(frasi):
    namedEntityDict = { "PERSON" : [], "GPE" : []}
    tokensPOStot=[]
    for frase in frasi:
        tokens = nltk.word_tokenize(frase)
        tokensPOS = nltk.pos_tag(tokens)
        analisi = nltk.ne_chunk(tokensPOS)
        for nodo in analisi: # walk the tree's top-level nodes
            NE = ''
            if hasattr(nodo, 'label'): # check whether this chunk is an NE subtree
                if nodo.label() in ["PERSON", "GPE"]: # proper names of people or places
                    for partNE in nodo.leaves(): # walk the leaves of the selected node
                        NE = NE+' '+partNE[0]
                    namedEntityDict[nodo.label()].append(NE)
        tokensPOStot = tokensPOStot + tokensPOS 
    return tokensPOStot, namedEntityDict 

# 10 most frequent PoS (part-of-speech) tags
Example 11
Project: That-s-Fake   Author: rajeevdesai   File: main.py    GNU General Public License v3.0
def ne_tagging(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
            else:
                continue
    return continuous_chunk 
Example 12
Project: razzy-spinner   Author: rafasashi   File: relextract.py    GNU General Public License v3.0
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel))) 
Example 13
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-6.1-nlp-pipeline.py    MIT License
def ne_chunking(targets):
    while True:
        tagged_words = (yield)
        ner_tagged = nltk.ne_chunk(tagged_words)
        for target in targets:
            target.send(ner_tagged) 
Example 14
Project: OpenBottle   Author: xiaozhuchacha   File: relextract.py    MIT License
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel))) 
Example 15
Project: OpenBottle   Author: xiaozhuchacha   File: relextract.py    MIT License
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel))) 
Example 16
Project: Health-Checker   Author: KriAga   File: relextract.py    MIT License
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel))) 
Example 17
Project: geograpy2   Author: Corollarium   File: extraction.py    MIT License
def named_entities(self):
        # word_tokenize should work well for most non-CJK languages
        text = nltk.word_tokenize(self.text)
        
        # TODO: this works only for english. Stanford's pos tagger supports
        # more languages
        # http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
        # http://stackoverflow.com/questions/1639855/pos-tagging-in-german
        # PT corpus http://aelius.sourceforge.net/manual.html
        # 
        pos_tag = nltk.pos_tag(text)
        
        nes = nltk.ne_chunk(pos_tag)
        return nes 
Example 18
Project: newsclouds-engine   Author: inmagik   File: clouds.py    MIT License
def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []

    for i in chunked:
        if i in common_articleswords:
            continue

        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    if continuous_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)

    return continuous_chunk 
Example 19
Project: nltk-book-2nd   Author: East196   File: name-entity-recognition.py    Apache License 2.0
def process_content():
	for word in tokenized[5:]:
		words = nltk.word_tokenize(word)
		tagged = nltk.pos_tag(words)

		namedEnt = nltk.ne_chunk(tagged, binary=True)

		namedEnt.draw() 
Example 20
Project: jroc   Author: domenicosolazzo   File: NLTKTagger.py    GNU General Public License v3.0
def __generate_tree(self, tags):
        """
        Transform a list of tags into a tree
        """
        from nltk import ne_chunk

        ne_tree = ne_chunk(tags)
        return ne_tree 
Example 21
Project: controcurator   Author: ControCurator   File: article.py    MIT License
def getNamedEntities(self):
		entities = {}
		chunks = nltk.ne_chunk(self.tagged)
		for chunk in chunks:
			if type(chunk) is nltk.Tree:
				t = ' '.join(c[0] for c in chunk.leaves())
				cat = chunk.label()
				entities[t] = cat
		return entities 
Example 22
Project: FancyWord   Author: EastonLee   File: relextract.py    GNU General Public License v3.0
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel))) 
Example 23
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: NER.py    MIT License
def sampleNE():
    sent = nltk.corpus.treebank.tagged_sents()[0]
    print(nltk.ne_chunk(sent)) 
Example 24
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: NER.py    MIT License
def sampleNE2():
    sent = nltk.corpus.treebank.tagged_sents()[0]
    print(nltk.ne_chunk(sent, binary=True)) 
Example 25
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: OwnNE.py    MIT License
def demo(samplestrings):
    for s in samplestrings:
        words = nltk.word_tokenize(s)
        tagged = nltk.pos_tag(words)
        # chunks = nltk.ne_chunk(tagged)
        chunks = cp.parse(tagged)
        print(nltk.tree2conllstr(chunks))
        print(chunks) 
Example 26
Project: Fake-News-Suspector   Author: Fake-News-Suspector   File: Keywords.py    MIT License
def entities(headline):
    return ne_chunk(pos_tag(word_tokenize(headline))) # more advanced categorization (e.g. also tells whether a chunk is a person name, an organization, etc.)
Example 27
Project: bookish-invention   Author: Shashankjain12   File: main.py    GNU General Public License v3.0
def pdfnotes(self):
        if self.p[1]==".pdf":
            pdfFileObject = open('/home/shashank/Downloads/'+self.file_name, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObject)

            count = pdfReader.numPages
            sentence=[]
            word_tags=[]
            for i in range(count):
                page = pdfReader.getPage(i)
                sentence.append(page.extractText().split('\n'))
                sentences=nltk.sent_tokenize(page.extractText())
                for j in range(len(sentences)):
                    sentences[j]=re.sub(r"[@#$%^&|?!':\n\"//]"," ",sentences[j])
                    words=nltk.word_tokenize(sentences[j])
                    newwords=[self.lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
                    sentences[j]=' '.join(newwords)
                #print(sentences)

                paragraph="\n".join(sentences)
                #translation from english to any other language
                if self.k=='yes' or self.k=='y': 
                    translation=self.translator.translate(paragraph)
                    print(translation)
                else:
                    print(paragraph)
                words=nltk.word_tokenize(paragraph)
                tagged_words=nltk.pos_tag(words)
                namedEnt=nltk.ne_chunk(tagged_words)
                print("page "+str(i)+":")
                namedEnt.draw() 
Example 28
Project: nltk-on-gae   Author: sivu22   File: relextract.py    Apache License 2.0
def ne_chunked():
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    rels = []
    for sent in nltk.corpus.treebank.tagged_sents()[:100]:
        sent = nltk.ne_chunk(sent)
        print(extract_rels('ORG', 'LOC', sent, corpus='ace', pattern = IN)) 
Example 29
Project: cvscan   Author: skcript   File: language_parser.py    MIT License
def fetch_name(resume_text):
  tokenized_sentences = nltk.sent_tokenize(resume_text)
  for sentence in tokenized_sentences:
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')):
      if hasattr(chunk, 'label'):# and chunk.label() == 'PERSON':
        chunk = chunk[0]
      (name, tag) = chunk
      if tag == 'NOUN':
        return name

  return "Applicant name couldn't be processed" 
Example 30
Project: atap   Author: foxbook   File: transformers.py    Apache License 2.0
def get_entities(self, document):
        entities = []
        for paragraph in document:
            for sentence in paragraph:
                trees = ne_chunk(sentence)
                for tree in trees:
                    if hasattr(tree, 'label'):
                        if tree.label() in self.labels:
                            entities.append(
                                ' '.join([child[0].lower() for child in tree])
                                )
        return entities 
Example 31
Project: 2.7-NLTK-videos   Author: PythonProgramming   File: nltk6.py    MIT License
def processLanguage():
    try:
        for item in contentArray:
            tokenized = nltk.word_tokenize(item)
            tagged = nltk.pos_tag(tokenized)
            print(tagged)

            namedEnt = nltk.ne_chunk(tagged)
            namedEnt.draw()

            time.sleep(1)

    except Exception as e:
        print(str(e)) 
Example 32
Project: 2.7-NLTK-videos   Author: PythonProgramming   File: nltk9.py    MIT License
def processor(data):
    namedEntArray = []
    try:
        tokenized = nltk.word_tokenize(data)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)

        entities = re.findall(r'NE\s(.*?)/',str(namedEnt))
        # matches tagged pairs like ('not', 'RB')
        descriptives = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'', str(tagged))
        if len(entities) > 1:
            pass
        elif len(entities) == 0:
            pass
        else:
            print('_________________________')
            print('Named:', entities[0])
            print('Descriptions:')
            for eachDesc in descriptives:
                print(eachDesc)

    except Exception as e:
        print('failed in the main try of processor')
        print(str(e))
        time.sleep(555) 
Example 33
Project: 2.7-NLTK-videos   Author: PythonProgramming   File: nltk8.py    MIT License
def processor(data):
    namedEntArray = []
    try:
        tokenized = nltk.word_tokenize(data)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)

        entities = re.findall(r'NE\s(.*?)/',str(namedEnt))
        # matches tagged pairs like ('not', 'RB')
        descriptives = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'', str(tagged))
        if len(entities) > 1:
            pass
        elif len(entities) == 0:
            pass
        else:
            print('_________________________')
            print('Named:', entities[0])
            print('Descriptions:')
            for eachDesc in descriptives:
                print(eachDesc)

    except Exception as e:
        print('failed in the main try of processor')
        print(str(e))
        time.sleep(555) 
Example 34
Project: 2.7-NLTK-videos   Author: PythonProgramming   File: nltk10.py    MIT License
def processor(data):
    namedEntArray = []
    try:
        tokenized = nltk.word_tokenize(data)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)

        entities = re.findall(r'NE\s(.*?)/',str(namedEnt))
        descriptives = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'', str(tagged))
        if len(entities) > 1:
            pass
        elif len(entities) == 0:
            pass
        else:
            print('_________________________')
            print('Named:', entities[0])
            print('Descriptions:')
            for eachDesc in descriptives:
                print(eachDesc)
                currentTime = time.time()
                dateStamp = datetime.datetime.fromtimestamp(currentTime).strftime('%Y-%m-%d %H:%M:%S')
                namedEntity = entities[0]
                relatedWord = eachDesc
                c.execute("INSERT INTO knowledgeBase (unix, datestamp, namedEntity, relatedWord) VALUES (?,?,?,?)",
                          (currentTime, dateStamp, namedEntity, relatedWord))
                conn.commit()

    except Exception as e:
        print('failed in the main try of processor')
        print(str(e))
        time.sleep(55) 
Example 35
Project: Utilities   Author: CodeCorp   File: gensubs.py    MIT License
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=False)
            return namedEnt
            # namedEnt.draw()
    except Exception as e:
        print(str(e))
        return 0 
Example 36
Project: luscan-devel   Author: blackye   File: relextract.py    GNU General Public License v2.0
def ne_chunked():
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    rels = []
    for sent in nltk.corpus.treebank.tagged_sents()[:100]:
        sent = nltk.ne_chunk(sent)
        print(extract_rels('ORG', 'LOC', sent, corpus='ace', pattern = IN)) 
Example 37
Project: semaeval   Author: axelspringer   File: simple.py    MIT License
def extract_entities(text, lang):
	entities={}
	for sent in nltk.sent_tokenize(text):
		for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
			# see http://stackoverflow.com/questions/26352041/nltk-entity-extraction-difference-from-nltk-2-0-4-to-nltk-3-0
			if hasattr(chunk, "label"):
				key=' '.join(c[0] for c in chunk.leaves())
				value=convert_label(chunk.label())
				entities[key]=value 
	return entities 
Example 38
Project: ntlk-examples   Author: athityakumar   File: chunking.py    GNU General Public License v3.0
def ne_chunk_textinput(self,text):
        '''
        :param text:
        :return:
        '''
        tagged_sent = nltk.pos_tag(nltk.word_tokenize(text))
        tagged_sent_new=nltk.ne_chunk(tagged_sent)
        return tagged_sent_new 
Example 39
Project: ntlk-examples   Author: athityakumar   File: chunking.py    GNU General Public License v3.0
def ne_chunk_listinput(self,text_list):
        '''
        :param text_list: [sent1,sent2,..]
        :return:
        '''
        tag_list = get_postag_listinput(text_list)
        chunked_list = [nltk.ne_chunk(sent) for sent in tag_list]
        return chunked_list 
Example 40
Project: ntlk-examples   Author: athityakumar   File: grammar.py    GNU General Public License v3.0
def parse_regex_grammar_textinput(self,chunked_sent,grammar=None,loop=1):
        '''
        :param chunked_sent: output of nltk.ne_chunk
        :return:
        '''
        if not grammar:
            grammar = self.grammar
        cp = nltk.RegexpParser(grammar,loop=loop)
        result = cp.parse(chunked_sent)
        return result 
Example 41
Project: nlp-services   Author: singnet   File: entity_recognizer_mod.py    MIT License
def nltk_tagger(self, token_text):
        tagged_words = nltk.pos_tag(token_text)
        ne_tagged = nltk.ne_chunk(tagged_words)
        return (ne_tagged)

    # Tag tokens with standard NLP BIO tags 
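
The trailing comment above refers to a BIO-tagging step that this excerpt cuts off before. As a hedged sketch of one common way to do it, nltk.chunk.tree2conlltags flattens an ne_chunk tree into (word, pos, IOB) triples; the input sentence here is hypothetical:

import nltk
from nltk.chunk import tree2conlltags

# Illustrative input only; any tokenized, POS-tagged sentence works.
tokens = nltk.word_tokenize("Steve Jobs founded Apple in California.")
ne_tree = nltk.ne_chunk(nltk.pos_tag(tokens))

# Flatten the tree into standard BIO triples, e.g.
# ('Steve', 'NNP', 'B-PERSON'), ('Jobs', 'NNP', 'I-PERSON'), ('founded', 'VBD', 'O'), ...
bio_tags = tree2conlltags(ne_tree)
print(bio_tags)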
Example 42
Project: honours_project   Author: JFriel   File: relextract.py    GNU General Public License v3.0
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel))) 
Example 43
Project: honours_project   Author: JFriel   File: relextract.py    GNU General Public License v3.0
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel))) 
Example 44
Project: honours_project   Author: JFriel   File: ner.py    GNU General Public License v3.0
def NER(sentence):
    try:
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)
        entities = nltk.ne_chunk(tagged)
        return entities
    except:
        print("Something went wrong") 
Example 45
Project: textlytics   Author: laugustyniak   File: document_preprocessing.py    MIT License
def extract_entities(self, doc):
        sentence_list = []
        for sent in sent_tokenize(doc):
            sentence_list.append(
                [chunk for chunk in ne_chunk(pos_tag(word_tokenize(sent)))])
        return sentence_list

    # TODO spacy 
Example 46
Project: textlytics   Author: laugustyniak   File: document_preprocessing.py    MIT License
def parts_of_speech_flow(self, doc):
        sentences = sent_tokenize(doc)
        tokenized = [word_tokenize(sentence) for sentence in sentences]
        pos_tags = [pos_tag(sentence) for sentence in tokenized]
        # ne_chunk expects a single tagged sentence, so chunk each sentence in turn
        return [ne_chunk(tagged_sent, binary=True) for tagged_sent in pos_tags] 
Example 47
Project: vdator   Author: werrpy   File: nltk_people.py    MIT License
def extract_names(document):
  names = []
  sentences = ie_preprocess(document)
  for tagged_sentence in sentences:
    for chunk in nltk.ne_chunk(tagged_sentence):
      if type(chunk) == nltk.tree.Tree:
        if chunk.label() == 'PERSON':
          names.append(' '.join([c[0] for c in chunk]))
  return names 
Example 48
Project: aop-helpFinder   Author: jecarvaill   File: relextract.py    GNU General Public License v3.0
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel))) 
Example 49
Project: Bro   Author: Zephrys   File: named_entity.py    MIT License
def processLanguage(query):
    try:
        tokenized = nltk.word_tokenize(query)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)
        return namedEnt
    except Exception as e:
        print(str(e)) 
Example 50
Project: serverless-chatbots-workshop   Author: datteswararao   File: relextract.py    Apache License 2.0
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel))) 
Example 51
Project: serverless-chatbots-workshop   Author: datteswararao   File: relextract.py    Apache License 2.0
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel))) 
Example 52
Project: Real-Time-Sentiment-Analyzer-of-Twitter-Trends   Author: gauthamkrishna-g   File: stanford_ner_tagger.py    MIT License
def nltk_tagger(token_text):
	tagged_words = nltk.pos_tag(token_text)
	ne_tagged = nltk.ne_chunk(tagged_words)
	return ne_tagged 
Example 53
Project: Real-Time-Sentiment-Analyzer-of-Twitter-Trends   Author: gauthamkrishna-g   File: biotagger.py    MIT License
def nltk_tagger(token_text):
	tagged_words = nltk.pos_tag(token_text)
	ne_tagged = nltk.ne_chunk(tagged_words)
	return ne_tagged 
Example 54
Project: Real-Time-Sentiment-Analyzer-of-Twitter-Trends   Author: gauthamkrishna-g   File: named_entity_recognition.py    MIT License
def process_content():
    try:
        for i in tokenized[:5]:
            tagged = pos_tag(word_tokenize(i)) # tagset='universal'
            namedEnt = ne_chunk(tagged, binary=False)
            print(namedEnt)
            namedEnt.draw()
    except Exception as e:   
        print(str(e)) 
Example 55
Project: bookish-invention   Author: Shashankjain12   File: main.py    GNU General Public License v3.0
def pngnotes(self):
        
        if self.p[1]=='.png':
            
            img=cv2.imread(self.file_name)
            a=pytesseract.image_to_string(img)
            cv2.imshow('image',img)
            cv2.waitKey()
            #b=a.split()
            #print(b)
            sentences=nltk.sent_tokenize(a)
            word_tags=[]
            
            for i in range(len(sentences)):
                sentences[i]=re.sub(r"[@#$%^&|?!'\"]"," ",sentences[i])
                words=nltk.word_tokenize(sentences[i])
                newwords=[self.lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
                sentences[i]=' '.join(newwords)
                """
                    tagged_words=nltk.pos_tag(newwords)
                for tw in tagged_words:
                    word_tags.append(tw[0]+" "+tw[1])
                tagged_par=" ".join(word_tags)
            namedEnt=nltk.ne_chunk(tagged_words)
            print(namedEnt)
            namedEnt.draw()
            print(tagged_par)
                """
    
            print(sentences) 
            paragraph="\n".join(sentences)
            if self.k=='yes' or self.k=='y':
                translation=self.translator.translate(paragraph)
                print(translation)
            else:
                print(paragraph)
            words=nltk.word_tokenize(paragraph)
            tagged_words=nltk.pos_tag(words)
            namedEnt=nltk.ne_chunk(tagged_words)
            #for i in range(len(namedEnt)):
            #       print(namedEnt[i][1])
            #       print(namedEnt[i][1][i] 
            namedEnt.draw()
            #print(paragraph) 
Example 56
Project: cvscan   Author: skcript   File: language_parser.py    MIT License
def fetch_all_organizations(resume_text):
  organizations = set()
  tokenized_sentences = nltk.sent_tokenize(resume_text)

  # Custom grammar with NLTK
  # NP - Noun Phrase
  # NN - Noun
  # NNP - Proper Noun
  # V - Verb
  # JJ - Adjective

  # In a sentence that contains NN NNP V NN NN JJ NN,
  # The noun-phrases fetched are:
  # NP: NN NNP
  # NP: NN NN
  # NP: NN

  # Ex, "Application Developer at Delta Force"
  # => ["Application Developer", "Delta Force"]

  grammar = r"""NP: {<NN|NNP>+}"""
  parser = nltk.RegexpParser(grammar)

  avoid_organizations = utilities.get_avoid_organizations()

  for sentence in tokenized_sentences:

    # tags all parts of speech in the tokenized sentences
    tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

    # then chunk with the custom grammar;
    # np_chunks are instances of class nltk.tree.Tree
    np_chunks = parser.parse(tagged_words)
    noun_phrases = []

    for np_chunk in np_chunks:
      if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP':
        # if np_chunk matches the 'NP' rule, create a space-separated string of all leaves under the 'NP' tree
        noun_phrase = ""
        for (org, tag) in np_chunk.leaves():
          noun_phrase += org + ' '

        noun_phrases.append(noun_phrase.rstrip())

    # Using name entity chunker to get all the organizations
    chunks = nltk.ne_chunk(tagged_words)
    for chunk in chunks:
      if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
        (organization, tag) = chunk[0]

        # if the organization is in the noun_phrase, there is a high chance the noun_phrase contains the employer name,
        # e.g., Delta Force is added to organizations even if only Delta is recognized as an organization, because Delta Force is a noun phrase
        for noun_phrase in noun_phrases:
          if organization in noun_phrase and organization not in avoid_organizations:
            organizations.add(noun_phrase.capitalize())

  return organizations 
Example 57
Project: news-audit   Author: clips   File: source_checker.py    GNU General Public License v3.0
def get_queries(self):

		"""Function to extract search queries from the text: 
		breaks text into ngrams, filters ngrams that consist mostly of stopwords or named entities, 
		selects an evenly spaced sample of the remaining ngrams"""

		text = self.text
		beg_quotes = re.findall(r'\"\S', text)
		for each in beg_quotes:
			text = text.replace(each, 'BEGQ' + each[-1])

		end_quotes = re.findall(r'\S\"', text)
		for each in end_quotes:
			text = text.replace(each, each[0] + 'ENDQ')

		text = re.sub('(ENDQ)+', 'ENDQ', text)
		text = re.sub('(BEGQ)+', 'BEGQ', text)
		text = text.replace('--', 'DOUBLEDASH')

		all_ngrams = ngrams(text, n = self.span, punctuation = "", continuous = True)
		if self.language in stopwords.fileids():
			stop_words = stopwords.words(self.language)
		else:
			stop_words = []	
		queries = []
		for ngram in all_ngrams:
			num_stop = len([w for w in ngram if w in stop_words])
			stop_score = float(num_stop)/len(ngram)
			if self.language == 'english':
				chunked = ne_chunk(pos_tag(ngram))
				named_entities = [[w for w, t in elt] for elt in chunked if isinstance(elt, nltk.Tree)]
				num_ent = sum([len(ent_list) for ent_list in named_entities])
				ent_score = float(num_ent)/len(ngram)
			else:
				ent_score = 0

			if stop_score < self.threshold and ent_score < self.threshold:
				r_string = self.reconstruct_ngram(ngram)
				if r_string in self.text:
					queries.append(r_string)

		# integer slice step; guard against a step of zero when there are few queries
		reduction = max(len(queries) // self.max_queries, 1)
		return queries[0::reduction]