Python nltk.parse.stanford.StanfordDependencyParser() Examples

The following code examples show how to use nltk.parse.stanford.StanfordDependencyParser(). They are taken from open source Python projects.
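
A minimal usage sketch may help before the project code below. It is not taken from any of the projects; the jar paths are placeholders and must point at an unpacked Stanford parser distribution.

from nltk.parse.stanford import StanfordDependencyParser

# Placeholder paths -- adjust to wherever stanford-parser.jar and its models jar live.
dep_parser = StanfordDependencyParser(
    path_to_jar='stanford-parser.jar',
    path_to_models_jar='stanford-parser-models.jar')

# raw_parse() tokenizes and parses a single sentence and returns an iterator
# of NLTK DependencyGraph objects.
graph = next(dep_parser.raw_parse('The quick brown fox jumps over the lazy dog'))

# triples() yields ((governor, tag), relation, (dependent, tag)) tuples;
# to_conll(10) gives the CoNLL table used in Example 4 below.
for governor, relation, dependent in graph.triples():
    print(governor, relation, dependent)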

Example 1
Project: RITE_zh-CN   Author: laddie132   File: base_processing.py    MIT License
def __init__(self):
        self.root_path = '../Models/stanfordNLP/'

        # word segmenter
        self.segmenter = StanfordSegmenter(
            path_to_jar=self.root_path + "stanford-segmenter.jar",
            path_to_slf4j=self.root_path + "log4j-over-slf4j.jar",
            path_to_sihan_corpora_dict=self.root_path + "segmenter/",
            path_to_model=self.root_path + "segmenter/pku.gz",
            path_to_dict=self.root_path + "segmenter/dict-chris6.ser.gz")

        # pos tagger
        self.posTagger = StanfordPOSTagger(self.root_path + 'pos-tagger/chinese-distsim.tagger',
                                           path_to_jar=self.root_path + "stanford-postagger.jar")

        # named entity recognizer
        self.nerTagger = StanfordNERTagger(self.root_path + 'ner/chinese.misc.distsim.crf.ser.gz',
                                           path_to_jar=self.root_path + 'stanford-ner.jar')

        self.parser = StanfordDependencyParser(model_path=self.root_path + 'lexparser/chinesePCFG.ser.gz',
                                               path_to_jar=self.root_path + 'stanford-parser.jar',
                                               path_to_models_jar=self.root_path + 'stanford-parser-3.7.0-models.jar',
                                               encoding='gbk') 
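
A hedged follow-up sketch of how these wrappers might be chained on a Chinese sentence. The variable proc and the example sentence are hypothetical; the attribute names come from the __init__ above, and segment(), tag() and parse() are the standard NLTK wrapper methods.

# proc is assumed to be an instance of the class whose __init__ is shown above.
tokens = proc.segmenter.segment(u'这是一个测试。').strip().split()  # segment() returns one space-separated string
tags = proc.posTagger.tag(tokens)        # list of (token, POS tag) pairs
entities = proc.nerTagger.tag(tokens)    # list of (token, NE label) pairs
graph = next(proc.parser.parse(tokens))  # DependencyGraph for the already-segmented sentence
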
Example 2
Project: nlpvis   Author: shusenl   File: dependencyTree.py    GNU General Public License v2.0
def getDependencyTree(self, sentence):
        # return {}
        hashKey = self.hashSentence(sentence)
        if hashKey in self.cache.keys():
            # print "found:", sentence
            return self.cache[hashKey]
        else:
            # path_to_jar = 'data/stanford-corenlp-3.9.1.jar'
            # path_to_models_jar = 'data/stanford-corenlp-3.9.1-models.jar'
            path_to_jar = 'data/stanford-corenlp-3.9.0.jar'
            path_to_models_jar = 'data/stanford-corenlp-3.9.0-models.jar'
            dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

            g = dependency_parser.raw_parse(sentence).next()

            dep_json = []

            for _, node in sorted(g.nodes.items()):
                if node['word'] is not None:
                    for key in node['deps']:
                        if len(node['deps'][key]) == 0:
                            continue
                        else:
                            for v in node['deps'][key]:
                                # node addresses start at 1 (0 is the artificial ROOT), so shift to 0-based indices
                                dep_json.append([node['address']-1, key, v-1])

            self.cache[hashKey] = dep_json

            #print '#####################', dep_json

            #print self.cache

            return dep_json
        #return list(g.triples()) 
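
A note on this example: raw_parse() returns an iterator of DependencyGraph objects, and the .next() call is Python 2 syntax. Under Python 3 the parsing step would look roughly like this (same jar paths as above):

dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
g = next(dependency_parser.raw_parse(sentence))  # Python 3: built-in next() instead of .next()
# The commented-out return above hints at a simpler view of the same parse:
# list(g.triples()) yields ((governor, tag), relation, (dependent, tag)) tuples.
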
Example 3
Project: BAMnet   Author: hugochan   File: webquestions.py    Apache License 2.0
def main(fb_path, mid2key_path, data_dir, out_dir):
    HAS_DEP = False
    if HAS_DEP:
        dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") # Set CLASSPATH and STANFORD_MODELS environment variables beforehand
    kb = load_ndjson(fb_path, return_type='dict')
    mid2key = load_json(mid2key_path)
    all_split_questions = []
    split = ['factoid_webqa/train.json', 'factoid_webqa/valid.json', 'factoid_webqa/test.json']
    files = [os.path.join(data_dir, x) for x in split]
    missing_mid2key = []

    for f in files:
        data_type = os.path.basename(f).split('.')[0]
        num_unanswerable = 0
        all_questions = []
        data = load_json(f)
        for q in data:
            questions = {}
            questions['answers'] = q['answers']
            questions['entities'] = q['entities']
            questions['qText'] = q['qText']
            questions['qId'] = q['qId']
            questions['freebaseKey'] = q['freebaseKey']
            questions['freebaseKeyCands'] = [q['freebaseKey']]
            for x in q['freebaseMids']:
                if x['mid'] in mid2key:
                    fbkey = mid2key[x['mid']]
                    if fbkey != q['freebaseKey']:
                        questions['freebaseKeyCands'].append(fbkey)
                else:
                    missing_mid2key.append(x['mid'])

            qtext = tokenize(q['qText'])
            if HAS_DEP:
                qw = list(set(qtext).intersection(question_word_list))
                question_word = qw[0] if len(qw) > 0 else ''
                topic_ent = q['freebaseKey']
                dep_path = extract_dep_feature(dep_parser, ' '.join(qtext), topic_ent, question_word)
            else:
                dep_path = []
            questions['dep_path'] = dep_path
            all_questions.append(questions)

            if not q['freebaseKey'] in kb:
                num_unanswerable += 1
                continue
            cand_ans = fetch_ans_cands(kb[q['freebaseKey']])
            norm_cand_ans = set([normalize_answer(x) for x in cand_ans])
            norm_gold_ans = [normalize_answer(x) for x in q['answers']]
            # Check if we can find the gold answer among the candidate answers.
            if len(norm_cand_ans.intersection(norm_gold_ans)) == 0:
                num_unanswerable += 1
                continue
        all_split_questions.append(all_questions)
        print('{} set: Num of unanswerable questions: {}'.format(data_type, num_unanswerable))

    for i, each in enumerate(all_split_questions):
        dump_ndjson(each, os.path.join(out_dir, split[i].split('/')[-1])) 
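
The inline comment in this example is important: only model_path is passed, so NLTK has to locate the parser jar and the models jar on its own, which it does via the CLASSPATH and STANFORD_MODELS environment variables. A hedged sketch of setting them up front (the directory is a placeholder):

import os

# Placeholder: directory containing stanford-parser.jar and the stanford-parser models jar.
stanford_dir = '/opt/stanford-parser-full-2016-10-31'
os.environ['CLASSPATH'] = stanford_dir
os.environ['STANFORD_MODELS'] = stanford_dir

dep_parser = StanfordDependencyParser(
    model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
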
Example 4
Project: nlp-example   Author: pmarcis   File: process-text-with-nltk.py    MIT License
def main(input_file, output_file, language):
    sys.stderr.write("Starting to process text in file {0} at {1}\n".format(input_file.name,str(datetime.datetime.now())))
    text=input_file.read()
    print "============= Raw text: ============="
    print text
    
    script_path=os.path.dirname(os.path.realpath(__file__)) #needed to figure out the path of dependencies when executing from different directories
    
    #this example is only for English (as ... NLTK has no models for Latvian)
    if language=="en":
        #First, we perform sentence breaking.
        sentences = sent_tokenize(text.strip(), language='english')
        print "============= Sentences: ============"
        print sentences
        #Then, we perform tokenization.
        tokens = [word_tokenize(s, language='english') for s in sentences]
        print "============== Tokens: =============="
        print tokens
        #In some cases (e.g., for indexing and search related tasks) it may be enough to perform stemming of the text.
        #This is, however, needed neither for tagging nor for parsing. It is included only as an example.
        stemmer = PorterStemmer(mode='NLTK_EXTENSIONS')
        stemmed_data = [[stemmer.stem(t) for t in s] for s in tokens]
        print "========== Stemmed tokens: =========="
        print stemmed_data
        #Then, we execute the Stanford log linear (maximum entropy-based) part-of-speech tagger
        tagger_jar = os.path.join(script_path,"dependencies","stanford-postagger-2016-10-31","stanford-postagger.jar")
        tagger_model = os.path.join(script_path,"dependencies","stanford-postagger-2016-10-31","models","english-bidirectional-distsim.tagger") 
        pos_tagger = StanfordPOSTagger(tagger_model, tagger_jar, encoding='utf8')
        tagged_data = [pos_tagger.tag(s) for s in tokens]
        print "========= Tagged sentences: ========="
        print tagged_data
        #When the data is tagged, we perform syntactic parsing using the Stanford parser.
        parser_jar = os.path.join(script_path,"dependencies","stanford-parser-full-2016-10-31","stanford-parser.jar")
        parser_model = os.path.join(script_path,"dependencies","stanford-parser-full-2016-10-31","stanford-parser-3.7.0-models.jar")
        parser=StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishFactored.ser.gz", path_to_models_jar=parser_model, path_to_jar=parser_jar)
        parsed_data = parser.tagged_parse_sents(tagged_data)
        #Finally, we print the result to the output file.
        #Note that the Stanford parser drops all punctuation marks and the output also lacks lemmas.
        #There is a way to get them back - create a class that inherits from StanfordDependencyParser and add "-outputFormatOptions includePunctuationDependencies" to the cmd that executes the parser.
        #... or use the Stanford Neural Dependency Parser instead!
        #For the example, I did not want to overly complicate the code.
        print "========= Parsed sentences: ========="
        for parsed_sentence in parsed_data:
            for dependency_graph in parsed_sentence:
                output_file.write(dependency_graph.to_conll(10))
                print dependency_graph.to_conll(10)
            output_file.write("\n")

    sys.stderr.write("... processing completed at {0}\n".format(str(datetime.datetime.now())))