import networkx as nx
import os
import codecs

from pycorenlp import StanfordCoreNLP
import matplotlib.pyplot as plt
import re
import operator
import json
from collections import OrderedDict
import logging
# custom
from classes import Stats, Annotations
from grobid_quantities.quantities import QuantitiesClient

# Globals
basedir = os.path.abspath(os.path.dirname(__file__))
stats = Stats()
A = None  # global annotations object (set per sentence by extract())
Num = None  # global raw number of the measurement being processed (set from match["num"] by extract())
G = None  # global dependency tree object (set per sentence by _build_graph())

# Log to 'measurement.log' next to this module.
# NOTE(review): the opening line of this call was missing from the file; the level is restored as
# DEBUG so that the logging.info() totals emitted by extract() are recorded - confirm the intended level.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename=os.path.join(basedir, 'measurement.log'))


def _build_graph(show=False):
    """Load word dependencies into graph using networkx. Enables easy traversal of dependencies for parsing particular patterns.
    One graph is created for each sentence.

    Nodes are keyed by the token index (as a string) and carry "word" and "pos" attributes;
    edges carry the dependency type in a "dep" attribute.

    Args:
        show (bool): If set to True, labeled visualization of network will be opened via matplotlib for each sentence

    Returns:
        None: Global variable G is set from within function

    """
    global G
    G = nx.Graph()
    node_labels, edge_labels = {}, {}
    roles = ("dependent", "governor")

    for dep in A.deps:

        # nodes, labels
        for role in roles:
            token_idx = dep[role]
            gloss = dep[role + "Gloss"]
            pos_tag = A.lookup[token_idx]["pos"]
            G.add_node(str(token_idx), word=gloss, pos=pos_tag)
            node_labels[str(token_idx)] = gloss + " : " + pos_tag

        # edges, labels
        dependent, governor = str(dep["dependent"]), str(dep["governor"])
        G.add_edge(dependent, governor, dep=dep["dep"])
        edge_labels[(dependent, governor)] = dep["dep"]

    if show:
        pos = nx.spring_layout(G)
        nx.draw_networkx(G, pos=pos, labels=node_labels, node_color="white", alpha=.5)
        nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=edge_labels)
        # Drawing alone only populates the current figure; show() is what opens the window
        plt.show()

# Dependency / POS parsing functions

def _get_connected(edge, idx):
    """If an edge connects to a node (word), return the index of the node

    The word at the other end of the edge is ignored when it equals the global Num
    (the number of the measurement currently being processed).

    Args:
        edge (tuple): Contains token indices of two connect words and the dependency type between them - e.g. ('11', '14', {'dep': 'nmod:at'})
        idx (int): Token index of word

    Returns:
        str or None: str if connected word is found in provided edge, None if not

    """
    target = str(idx)

    if str(edge[0]) == target and A.lookup[int(edge[1])]["word"] != Num:
        return edge[1]

    if str(edge[1]) == target and A.lookup[int(edge[0])]["word"] != Num:
        return edge[0]

    return None

def _get_cousin(sibling_idx, dep_type_list, visited_nodes=None):
    """Find a second degree relation within the dependency graph.
    Used to find subject in a sentence when the measurement unit is a direct object, for example.

    Args:
        sibling_idx (str): Token index of the sibling node through which to find the cousin
        dep_type_list (list): Allowable dependency types connecting sibling to cousin
        visited_nodes (dict, optional): Visit counter per token index, shared between the recursive
            calls of one search. Leave unset when calling from outside; a fresh counter is created.

    Returns:
        set: cousin words meeting POS and dependency criteria

    """
    # A mutable default ({}) would be shared by every call made during the life of the process,
    # so visit counts from earlier sentences would leak into later searches.
    if visited_nodes is None:
        visited_nodes = {}

    words = []
    allowed_pos = ["NN", "PR"]

    for dep_type in dep_type_list:
        for edge in G.edges(data=True):

            cousin_idx = _get_connected(edge, sibling_idx)
            if not cousin_idx:
                continue

            # Visited nodes prevent recursion from bouncing between two "VB" nodes.
            # The visit is recorded *before* recursing: if it were recorded only after the recursive
            # call returned, the deeper call could never see it and two verbs joined by a matching
            # dependency would recurse into each other without bound.
            seen_before = visited_nodes.get(cousin_idx, 0)
            visited_nodes[cousin_idx] = seen_before + 1

            if dep_type not in edge[2]['dep']:
                continue

            cousin_pos = A.lookup[int(cousin_idx)]['pos']

            if any(x in cousin_pos for x in allowed_pos):
                words.append(A.lookup[int(cousin_idx)]["word"])

            # Go to second cousin if cousin is a verb
            elif "VB" in cousin_pos and seen_before < 2:
                words.extend(_get_cousin(cousin_idx, ["nsubj", "nsubjpass", "acl"], visited_nodes=visited_nodes))

    return set(words)

def _add_related(related, dep, all_related, index, connector=None):
    """Adds a word (and its metadata) related to a measurement to the list of all related words for that measurement

    Args:
        related (str): related token/word
        dep (str): the dependency type connecting the unit to the related word
        all_related (list): existing list of "related" objects to be augmented
        index (str): token index of related word
        connector (str): if related word is cousin of unit (not sibling) then connecter is word between

    Returns:
        list: all related words for a given measurement (augmented with new 'related' passed in)

    """
    token = A.lookup[int(index)]

    doc = {
        "relationForm": dep,
        "rawName": related,
        "tokenIndex": int(index),
        "offsetStart": token["start"],
        "offsetEnd": token["end"],
        "connector": "" if connector is None else connector,
    }

    # Only add the word once; an identical entry may be produced by several patterns
    if doc not in all_related:
        all_related.append(doc)

    return all_related

def _add_descriptors(related):
    """For related words found for a measurement (usually nouns), add any connected adjectives, compounds, or modifiers.

    Entries appended to the list while it is being walked (the "nmod" cousins found below) are
    visited by the same loop, so they receive a "descriptors" list as well.

    Args:
        related (list): objects containing related words and their metadata

    Returns:
        list: original list of related objects augmented with additional descriptor words

    """

    for r in related:
        r["descriptors"] = []

        for edge in G.edges(data=True):
            sibling_idx = _get_connected(edge, r["tokenIndex"])
            if not sibling_idx:
                continue

            sibling = A.lookup[int(sibling_idx)]
            dep_type = edge[2]["dep"]

            if sibling["pos"] == "JJ" or dep_type in ["amod", "compound"]:
                r["descriptors"].append({
                    "tokenIndex": sibling_idx,
                    "rawName": sibling["word"]
                })

            # A noun attached through "amod" may itself lead to further related words via "nmod"
            if "NN" in sibling["pos"] and "amod" in dep_type:
                additional_related = _get_cousin(sibling_idx, ["nmod"])
                for add in set(additional_related):
                    # NOTE(review): the tail of this call was missing from the file; the connector is
                    # restored as the sibling word (the word between the two) - confirm.
                    related = _add_related(add, "nmod", related, A.index_lookup[add],
                                           connector=sibling["word"])

    return related

def _check_criteria(dep, dep_obj, all_related, edge, sibling_idx):
    """ If measurement is found, runs processed sentence through valid dependency patterns
        (from JSON file) to find additional words related to measurements

    Args:
        dep (str): dependency type present in dependency patterns JSON
        dep_obj (dict): object containing accepted POS types and measurement formats for a given dependency type
        all_related (list): contains words related to a measurement to be augmented if valid pattern is found
        edge (list): Connected node (word) indices and dependency type between
        sibling_idx (str): token index of word connected to unit

    Returns:
        list: related words and metadata

    """
    related = []

    # Check for a matching dependency type
    if edge[2]["dep"] != dep:
        return all_related

    sibling = G.node[sibling_idx]

    # Check for matching POS type(s)
    for pos_logic in dep_obj.keys():
        connector = None

        # Only the POS rule tables are dicts; other entries of the pattern object are skipped
        if not isinstance(dep_obj[pos_logic], dict):
            continue

        for pos in dep_obj[pos_logic].keys():
            rule = dep_obj[pos_logic][pos]

            # Check for allowed part of speech tags in matched dependency patterns
            if (pos_logic == "pos_in" and pos in sibling["pos"]) or (
                    pos_logic == "pos_equals" and pos == sibling["pos"]):
                pass
            elif pos_logic == "pos_not":
                # NOTE(review): kept as found. This builds a non-empty list, which is always truthy,
                # so the `continue` can never run; it also iterates the rule-type names of dep_obj
                # rather than the POS tags listed under "pos_not". Presumably the intent is to skip
                # when the sibling POS is one of the excluded tags - confirm against the patterns
                # file before changing, since that would alter extraction results.
                if not [False if not_pos == sibling["pos"] else True for not_pos in
                        dep_obj.keys()]: continue
            else:
                # POS rule does not apply to this sibling
                continue

            # if no additional checks, have a match
            if rule is None or any(y in rule for y in [None, "add_sibling"]):
                all_related = _add_related(sibling['word'], dep, all_related,
                                           A.index_lookup[sibling['word']])

            # if additional checks are required, process further
            if rule:
                if "get_cousin" in rule:
                    related.extend(_get_cousin(sibling_idx, rule["get_cousin"]))
                    connector = sibling['word']

                if "special" in rule:
                    if dep == "compound" and pos == "NN":
                        related = [sibling['word']]

                if None in related:
                    related.remove(None)

                # Allows for getting cousin and returning sibling
                if "else" in rule.keys() and rule["else"] == "always":
                    all_related = _add_related(sibling['word'], dep, all_related,
                                               A.index_lookup[sibling['word']], connector=connector)

                if len(related) > 0 and isinstance(related, list):
                    for x in related:
                        if x is not None:
                            all_related = _add_related(x, dep, all_related, A.index_lookup[x],
                                                       connector=connector)
                elif "else" in rule.keys() and rule["else"] == True:
                    all_related = _add_related(sibling['word'], dep, all_related,
                                               A.index_lookup[sibling['word']], connector=connector)

    return all_related

def _parse_patterns(unit_idx, measurement_format, patterns_file):
    """ Loads depedency patters JSON file and uses "_check_criteria" to look for words related to measurement (connected via unit token)

    Args:
        unit_idx (list): index or indices of measurement unit token(s)
        measurement_format (str): indicates form of measurement value + unit (attached: 10m, space between: 10 m, hyphenated: 10-m)
        patterns_file (str): path of the dependency patterns JSON file, relative to this module's directory

    Returns:
        list: related words and metadata

    """

    all_related = []
    tree = None  # patterns are read on first use and at most once per call

    for edge in G.edges(data=True):
        for idx in unit_idx:
            sibling_idx = _get_connected(edge, idx)
            if not sibling_idx:
                continue

            if tree is None:
                with open(os.path.join(basedir, patterns_file), "r") as patterns:
                    tree = json.load(patterns)

            for dep in tree["dep"].keys():
                dep_rules = tree["dep"][dep]

                if dep_rules["enhanced"] == True:
                    # Enhanced dependencies are written "<dep>:<inner_dep>", e.g. "nmod:at"
                    for inner_dep in dep_rules.keys():
                        inner_rules = dep_rules[inner_dep]
                        if isinstance(inner_rules, dict) and measurement_format in \
                                inner_rules["measurement_types"]:
                            full_dep = dep + ":" + inner_dep
                            all_related = _check_criteria(full_dep, inner_rules, all_related, edge,
                                                          sibling_idx)

                elif measurement_format in dep_rules["measurement_types"]:
                    all_related = _check_criteria(dep, dep_rules, all_related, edge, sibling_idx)

            for operator_word in tree["word"]["or"]:
                if G.node[sibling_idx]["word"] == operator_word:
                    related = _get_cousin(sibling_idx, ["nsubj"])
                    for r in related:
                        all_related = _add_related(r, "operator", all_related, A.index_lookup[r])

    all_related = _add_descriptors(all_related)

    return all_related

def _get_related(stats, match, patterns_file):
    """ Calls _parse_patterns() to get words related to a measurement and provides de-duplication between related words and grobid response

    Side effects on match: adverbs found for the number/unit are stored in match["grobid"]["adverbs"],
    and metadata of related words that duplicate the Grobid unit or quantified word is copied onto
    those Grobid entries.

    Args:
        stats (Stats object): Global object used to track parsing behaviors (not used by this function)
        match (dict): information on measurements and units extracted by Grobid
        patterns_file (str): path of the dependency patterns JSON file, passed on to _parse_patterns()

    Returns:
        list: related words and metadata

    """
    all_related = _parse_patterns(match["unit_idx"], match["measurement_format"], patterns_file)

    # NOTE(review): kept as found, but this fallback cannot run - _parse_patterns() always returns a
    # list, never None, and the fallback passes a list where a format string is expected. Confirm
    # whether the intent was `if not all_related:` with "uncertain".
    if all_related is None:
        all_related = _parse_patterns(match["unit_idx"], ["uncertain"], patterns_file)

    # get words like approximately
    num_adverbs = _parse_patterns([match["num_idx"]], match["measurement_format"], patterns_file)
    # unit_idx is already a list (it is passed as-is above); wrapping it in another list made every
    # index comparison fail, so adverbs attached to the unit were never found
    unit_adverbs = _parse_patterns(match["unit_idx"], match["measurement_format"], patterns_file)

    adverbs = []
    for a in num_adverbs + unit_adverbs:
        if a["relationForm"] != "advmod":
            continue
        for key in ["descriptors", "connector"]:  # not relevant for adverbs
            a.pop(key, None)
        if a not in adverbs:  # the same adverb can be reached from both number and unit
            adverbs.append(a)

    if adverbs:
        match["grobid"]["adverbs"] = adverbs

    # Check to make sure related isn't already a number, unit, or quantified thing identified by Grobid
    potential_keys = ["quantity", "quantityLeast", "quantityMost", "quantified"]

    if all_related:
        for key in potential_keys:
            if key not in match["grobid"]:
                continue

            entry = match["grobid"][key]
            num, unit, quantified = "", "", ""
            if "rawValue" in entry: num = entry["rawValue"]
            if "rawUnit" in entry: unit = entry["rawUnit"]["name"]
            if "normalizedName" in entry: quantified = entry["normalizedName"]

            kept = []
            for related in all_related:
                name = related["rawName"]

                duplicate = name in [num, unit, quantified] or name == num + unit or (
                    quantified in name and not quantified == "")

                if not duplicate:
                    kept.append(related)
                    continue

                # NOTE(review): dropping the duplicate is restored from the docstring and the comment
                # above; the statement that performed it was missing from the file - confirm.
                # Before dropping it, carry over metadata Grobid did not supply.
                if name == unit and "rawUnit" in entry:
                    for k in related.keys():
                        if k not in entry["rawUnit"]:
                            entry["rawUnit"][k] = related[k]

                elif name == quantified:
                    for k in related.keys():
                        if k not in entry:
                            entry[k] = related[k]

            all_related = kept

    return all_related

def _simplify_results(match):
    """WORK IN PROGRESS: Prune metadata from extracted measurements and related words for more readable output

        match (dict): Object contatining all metadata about extraction types, locations, relationships within sentence

        list: contains 4 items, extracted numeric value or range (list), unit(s) (list), qunatified words identified by Grobid (str), related words (str)

    keys = []
    simplified = {}
    simplified["value"] = []

    if match["type"] == "value":
        keys = ["quantity"]
    elif match["type"] == "interval":
        keys = ["quantityLeast", "quantityMost"]

    for key in keys:
        if key in match:
            if "parsedValue" in match[key]:
            elif "rawValue" in match[key]:
                return None

            simplified["unit"] = match[key]["rawUnit"]["name"] if "rawUnit" in match[key] else ""

    if len(simplified["value"]) == 1:
        simplified["value"] = simplified["value"][0]

    simplified["quantified"] = {}
    simplified["related"] = {}

    if "quantified" in match:

        if simplified["unit"] == "":
            simplified["unit"] = match["quantified"]["normalizedName"]

        simplified["quantified"][match["quantified"]["normalizedName"]] = []
        if "descriptors" in match["quantified"]:

            match["quantified"]["descriptors"].sort(key=lambda x: int(x["tokenIndex"]), reverse=False)

            for x in match["quantified"]["descriptors"]:

    if match["related"]:
        for r in match["related"]:
            simplified["related"][r["rawName"]] = []

            if "descriptors" in r:
                r["descriptors"].sort(key=lambda x: int(x["tokenIndex"]), reverse=False)

                for z in r["descriptors"]:

    return simplified

def _reconstruct_sent(parsed_sentence):
    """Reconstruct sentence from CoreNLP tokens - raw sentence text isn't retained by CoreNLP after sentence splitting and processing

        parsed_sentence (dict): Object containing CoreNLP output

        str: original sentence
    sent = ""
    for x in range(0, len(parsed_sentence["tokens"])):
        sent += parsed_sentence["tokens"][x]['originalText']
        if x + 1 != len(parsed_sentence["tokens"]):
            # Use character indices from tokens to ensure correct spacing when reconstructing
            num_spaces = parsed_sentence["tokens"][x + 1]["characterOffsetBegin"] - parsed_sentence["tokens"][x][
            for y in range(0, num_spaces):
                sent += " "
    return sent

def _sorted_dictionary(orig_dict, sort_list):
    od = OrderedDict()
    for item in sort_list:
        if item in orig_dict:
            od[item] = orig_dict[item]

    return od

# Top-Level function
def extract(content, corenlp_endpoint, grobid_endpoint, dependency_patterns_file, output_file=None, show_graph=False,
            pretty=False, simplify=False):
    """ Top-level user interface to parsing measurements and related words

    Args:
        content (str): sentence or paragraph to be parsed (shouldn't be much larger)
        corenlp_endpoint (str): host + port of CoreNLP service (e.g. "http://localhost:9000")
        grobid_endpoint (str): host + port of grobid service (e.g. "http://localhost:8080")
        dependency_patterns_file (str): filepath to dependency patterns JSON file
        output_file (optional: str): file to write output to (opened in append mode)
        show_graph (bool): Will show network visualization of sentence dependencies if True
        pretty (bool): JSON output will be pretty printed if True, else one JSON doc per line (JSONL)
        simplify (bool): If True provides bare bones output with only extractions and not metadata about indices, types, etc.

    Returns:
        List of objects: containing parsed measurement info, or None if content is shorter than 5 characters
        (optionally write to file)

    """
    global A, Num

    # Checked before the output file is opened so that nothing is left open on the early return
    if len(content) < 5:
        return None

    all_extractions = []
    out = codecs.open(output_file, "a", encoding="utf-8") if output_file else None

    try:
        nlp = StanfordCoreNLP(corenlp_endpoint)
        output = nlp.annotate(content, properties={'outputFormat': 'json', 'timeout': '9999'})

        # A string here means the client could not decode the JSON itself; retry leniently.
        # NOTE(review): the latin-1 round trip presumably undoes an ISO-8859-1 decode of a UTF-8
        # response on the HTTP client side - confirm before changing.
        if isinstance(output, str):  # str supports both python 2 and 3
            output = json.loads(output.encode("latin-1"), strict=False)

        if "sentences" in output and isinstance(output["sentences"], list):
            for i, sentence in enumerate(output["sentences"]):
                s_str = _reconstruct_sent(sentence)

                # Enhanced dependencies have different key names in JSON depending on version of CoreNLP
                # NOTE(review): the entries of this list were missing from the file; these are the
                # enhanced++ key spellings used by later CoreNLP releases - confirm.
                possible_keys = [
                    "enhanced-plus-plus-dependencies",
                    "enhancedPlusPlusDependencies",
                ]

                dep_key = "collapsed-ccprocessed-dependencies"  # default key
                if "collapsed-ccprocessed-dependencies" not in sentence:
                    for k in possible_keys:
                        if k in sentence:
                            dep_key = k

                if dep_key not in sentence:
                    logging.warning("CoreNLP parsing failed for sentence: %s", s_str)
                    continue

                A = Annotations(sentence["tokens"], sentence[dep_key])

                if A.check_output(sentence, stats) is not True:
                    logging.warning("CoreNLP parsing failed for sentence: %s", s_str)
                    continue

                stats.total_sentences += 1
                _build_graph(show=show_graph)  # sets the module-level G; it returns nothing
                grobid_response = grobid_quantities(s_str, A, grobid_endpoint)

                if not (isinstance(grobid_response, dict) and "measurements" in grobid_response):
                    continue

                for quantity in grobid_response["measurements"]:
                    # NOTE(review): the body of this loop was missing from the file; registering each
                    # Grobid measurement on the annotations object (which fills A.matches) is the
                    # presumed call - confirm the method name against classes.Annotations.
                    A.augment_match(quantity)

                stats.total_measurements += len(A.matches)

                for idx, match in enumerate(A.matches):

                    Num = match["num"]

                    match["sentence"] = i + 1
                    match["grobid"]["related"] = _get_related(stats, match, dependency_patterns_file)

                    # Remove fields used for processing but not to be shown to user
                    remove = ["adverbs", "num", "unit", "connector", "form", "sentence", "num_idx", "unit_idx",
                              "measurement_format"]
                    for x in remove:
                        match.pop(x, None)
                    sort_order = ['adverbs', 'type', 'quantity', 'quantityLeast', 'quantityMost', 'quantified',
                                  'related']

                    match_ordered = _sorted_dictionary(match["grobid"], sort_order)

                    if simplify:
                        simplified_sort_order = ['value', 'unit', 'quantified', 'related']
                        simplified = _simplify_results(match_ordered)

                        # Order the simplified result itself; ordering match["grobid"] here would
                        # throw the simplification away
                        if simplified:
                            match_ordered = _sorted_dictionary(simplified, simplified_sort_order)

                    if pretty and not simplify:
                        if out:
                            out.write(json.dumps(match_ordered, ensure_ascii=False, indent=4))
                        if idx != len(A.matches) - 1 and out:
                            # NOTE(review): separator restored from context (written between
                            # documents, not after the last one) - confirm.
                            out.write(",\n")

                    elif out:
                        out.write(json.dumps(match_ordered, ensure_ascii=False) + "\n")

                    all_extractions.append(match_ordered)

        else:
            logging.warning("CoreNLP parsing failed for content: %s", content)

    finally:
        # Close the output file even if one of the services raised
        if out:
            out.close()

    logging.info("Total sentences parsed: %s", str(stats.total_sentences))
    logging.info("Total measurements found: %s", str(stats.total_measurements))

    return all_extractions

def grobid_quantities(sentence, a, endpoint):
    """Pass sentence text to Grobid server on port 8080 for measurement parsing

    Each measurement in the response is augmented in place with CoreNLP token indices for the
    value, the unit and (if present) the quantified word.

    Args:
        sentence (str): Sentence to be parsed
        a (Annotations object): object containing relevant CoreNLP output
        endpoint (str): host + port of grobid service; a trailing "/" is ignored

    Returns:
        dict: object containing Grobid output. An empty dict is returned when a measurement cannot be
            used (list type, incomplete interval, unknown type, value not aligned with a CoreNLP token).
            An empty string is returned when Grobid does not answer with status 200.

    """
    # $ needs to be escaped when passed via subprocess
    # NOTE(review): these substitutions change the length of the sentence, while the offsets Grobid
    # returns are compared below with CoreNLP character offsets - confirm they are still wanted now
    # that the text is sent through QuantitiesClient.
    sentence = sentence.replace("$", "\\$")
    sentence = sentence.replace("\"", '\\"')
    sentence = sentence.replace("%", '%25')
    sentence = sentence.replace("`", "'")
    sentence = sentence.replace("'", '\\"')

    if endpoint.endswith("/"):
        endpoint = endpoint[:-1]

    response = QuantitiesClient(endpoint).process_text(sentence)
    if response[0] != 200:
        print('No Grobid response for: %s' % sentence)
        logging.warning('No Grobid response for: %s' % sentence)
        return ""

    quantities = response[1]

    # Add token index for num, unit, quantified if available
    if isinstance(quantities, dict):
        for q in quantities.get("measurements", []):

            key = ""
            if q["type"] == "value":
                key = "quantity"
            # if Grobid doesn't parse interval correctly, sometimes only 'QuantityLeast' or 'QuantityMost' is available
            if q["type"] == "interval":
                if "quantityLeast" in q:
                    key = "quantityLeast"
                elif "quantityMost" in q:
                    key = "quantityMost"
                else:
                    return {}

            if q["type"] == "listc":
                return {}

            if key == "" or key not in q:
                logging.error('Unknown Grobid key resulting from parse of: %s' % sentence)
                print("Unknown Grobid key resulting from parse of: %s" % sentence)
                # Nothing below can work without a quantity entry
                return {}

            value = q[key]

            # Grobid doesn't pick up negatives
            # Only look at the preceding character when there is one: a position of 0 or -1 would
            # otherwise wrap around to the end of the sentence
            value_pos = sentence.find(value["rawValue"])
            if value_pos > 0 and sentence[value_pos - 1] == "-":
                if "parsedValue" in value:
                    value["parsedValue"] = -abs(float(value["parsedValue"]))
                value["rawValue"] = "-" + str(value["rawValue"])
                value["offsetStart"] -= 1

            if value["offsetStart"] in a.tok_start:
                value["tokenIndex"] = a.tok_start[value["offsetStart"]]
            else:
                print("Not finding token index for Grobid Quantity value in CoreNLP output. Sentence: %s" % sentence)
                # NOTE(review): the call line was missing from the file; the log level is restored
                # as error, like the unknown-key case above - confirm.
                logging.error(
                    "Not finding token index for Grobid Quantity value in CoreNLP output. Sentence: %s" % sentence)
                return {}

            if "rawUnit" in value:
                raw_unit = value["rawUnit"]
                raw_unit["after"] = a.lookup[value["tokenIndex"]]["after"]
                raw_unit["tokenIndices"] = []

                # NOTE(review): the bodies of the next three conditions were missing from the file;
                # collecting the tokens where the unit starts and ends is the presumed intent - confirm.
                if raw_unit["offsetStart"] in a.tok_start:
                    raw_unit["tokenIndices"].append(a.tok_start[raw_unit["offsetStart"]])
                if raw_unit["offsetEnd"] in a.tok_end:
                    raw_unit["tokenIndices"].append(a.tok_end[raw_unit["offsetEnd"]])

                # Unit starts exactly where the value ends (e.g. "10m"): it shares the value's token
                if raw_unit["offsetStart"] == value["offsetEnd"]:
                    raw_unit["tokenIndices"].append(value["tokenIndex"])
                raw_unit["tokenIndices"] = list(set(raw_unit["tokenIndices"]))

            if "quantified" in q:
                quantified = q["quantified"]

                # often times Grobid with return a phrase where normalized name is in middle. In this case, "offsetStart" identifies the wrong token
                add_to_offset = 0
                if " " in quantified["rawName"]:
                    for w in quantified["rawName"].split(" "):
                        if quantified["normalizedName"] not in w:
                            add_to_offset += (len(w) + 1)  # +1 for space that was split on
                        else:
                            # NOTE(review): stopping at the first word that holds the normalized
                            # name is restored from the comment above - confirm.
                            break

                quantified["offsetStart"] += add_to_offset

                if quantified["offsetStart"] in a.tok_start:
                    quantified["tokenIndex"] = a.tok_start[quantified["offsetStart"]]
                else:
                    logging.error(
                        "Not finding token index for Grobid quantified word in CoreNLP output. Sentence: %s" % (
                            sentence))
                    # hyphen causing issue - Grobid doesn't treat hyphenated clause as one word
                    # example error sentence: "Macroscopic examination of the CNS revealed micrencephaly with a whole-brain weight of 84 grams."

    return quantities