python source code of preprocessing

# -*- coding: utf-8 -*-
import codecs
import cPickle
from collections import Counter
import matplotlib.pyplot as plt
import spacy
import numpy as np
import sqlite3
from geopy.distance import great_circle
from matplotlib import pyplot, colors


# -------- GLOBAL CONSTANTS AND VARIABLES -------- #
BATCH_SIZE = 64
CONTEXT_LENGTH = 200  # each side of target entity
UNKNOWN = u"<unknown>"
EMBEDDING_DIMENSION = 50
TARGET_LENGTH = 15
ENCODING_MAP_1x1 = cPickle.load(open(u"data/1x1_encode_map.pkl"))      # We need these maps
ENCODING_MAP_2x2 = cPickle.load(open(u"data/2x2_encode_map.pkl"))      # and the reverse ones
REVERSE_MAP_1x1 = cPickle.load(open(u"data/1x1_reverse_map.pkl"))      # to handle the used and
REVERSE_MAP_2x2 = cPickle.load(open(u"data/2x2_reverse_map.pkl"))      # unused map_vector polygons.
OUTLIERS_MAP_1x1 = cPickle.load(open(u"data/1x1_outliers_map.pkl"))    # Outliers are redundant polygons that
OUTLIERS_MAP_2x2 = cPickle.load(open(u"data/2x2_outliers_map.pkl"))    # have been removed but must also be handled.
# -------- GLOBAL CONSTANTS AND VARIABLES -------- #


def print_stats(accuracy):
    """
    Prints Mean, Median, AUC and acc@161km for the list.
    :param accuracy: a list of geocoding errors
    """
    print("==============================================================================================")
    print(u"Median error:", np.median(sorted(accuracy)))
    print(u"Mean error:", np.mean(accuracy))
    accuracy = np.log(np.array(accuracy) + 1)
    k = np.log(161)
    print u"Accuracy to 161 km: ", sum([1.0 for dist in accuracy if dist < k]) / len(accuracy)
    print u"AUC = ", np.trapz(accuracy) / (np.log(20039) * (len(accuracy) - 1))  # Trapezoidal rule.
    print("==============================================================================================")


def pad_list(size, a_list, from_left, padding):
    """
    Utility function that pads a list with any given padding.
    :param size: the final length of the list i.e. pad up to size
    :param a_list: the list to pad
    :param from_left: True to pad from the left, False to pad from the right
    :param padding: whatever you want to use for padding, example "0"
    :return: the padded list
    """
    while len(a_list) < size:
        if from_left:
            a_list = [padding] + a_list
        else:
            a_list += [padding]
    return a_list


def coord_to_index(coordinates, polygon_size):
    """
    Convert coordinates into an array (world representation) index. Use that to modify map_vector polygon value.
    :param coordinates: (latitude, longitude) to convert to the map vector index
    :param polygon_size: integer size of the polygon? i.e. the resolution of the world
    :return: index pointing into map_vector array
    """
    latitude = float(coordinates[0]) - 90 if float(coordinates[0]) != -90 else -179.99  # The two edge cases must
    longitude = float(coordinates[1]) + 180 if float(coordinates[1]) != 180 else 359.99  # get handled differently!
    if longitude < 0:
        longitude = -longitude
    if latitude < 0:
        latitude = -latitude
    x = int(360 / polygon_size) * int(latitude / polygon_size)
    y = int(longitude / polygon_size)
    return x + y if 0 <= x + y <= int(360 / polygon_size) * int(180 / polygon_size) else Exception(u"Shock horror!!")


def index_to_coord(index, polygon_size):
    """
    Convert index (output of the prediction model) back to coordinates.
    :param index: of the polygon/tile in map_vector array (given by model prediction)
    :param polygon_size: size of each polygon/tile i.e. resolution of the world
    :return: pair of (latitude, longitude)
    """
    x = int(index / (360 / polygon_size))
    y = index % int(360 / polygon_size)
    if x > int(90 / polygon_size):
        x = -int((x - (90 / polygon_size)) * polygon_size)
    else:
        x = int(((90 / polygon_size) - x) * polygon_size)
    if y < int(180 / polygon_size):
        y = -int(((180 / polygon_size) - y) * polygon_size)
    else:
        y = int((y - (180 / polygon_size)) * polygon_size)
    return x, y


def get_coordinates(con, loc_name):
    """
    Access the database to retrieve coordinates and other data from DB.
    :param con: sqlite3 database cursor i.e. DB connection
    :param loc_name: name of the place
    :return: a list of tuples [(latitude, longitude, population, feature_code), ...]
    """
    result = con.execute(u"SELECT METADATA FROM GEO WHERE NAME = ?", (loc_name.lower(),)).fetchone()
    if result:
        result = eval(result[0])  # Do not remove the sorting, the function below assumes sorted results!
        return sorted(result, key=lambda (a, b, c, d): c, reverse=True)
    else:
        return []


def construct_map_vector(a_list, polygon_size, mapping, outliers):
    """
    Build the map_vector vector representation from a_list of location data.
    :param a_list: of tuples [(latitude, longitude, population, feature_code), ...]
    :param polygon_size: what's the resolution? size of each polygon in degrees.
    :param mapping: one of the transformation maps 1x1 or 2x2
    :param outliers: the outlier map, 1x1 or 2x2 (must match resolution or mapping above)
    :return: map_vector representation
    """
    map_vector = np.zeros(len(mapping), )
    if len(a_list) == 0:
        return map_vector
    max_pop = a_list[0][2] if a_list[0][2] > 0 else 1
    for s in a_list:
        index = coord_to_index((s[0], s[1]), polygon_size)
        if index in mapping:
            index = mapping[index]
        else:
            index = mapping[outliers[index]]
        map_vector[index] += float(max(s[2], 1)) / max_pop
    return map_vector / map_vector.max() if map_vector.max() > 0.0 else map_vector


def construct_map_vector_full_scale(a_list, polygon_size):
    """
    This function is similar to the above BUT it builds map_vector WITHOUT removing redundant polygons.
    :param a_list: of tuples [(latitude, longitude, population, feature_code), ...]
    :param polygon_size: size of each polygon in degrees i.e 1x1 or 2x2
    :return: map_vector (full scale) i.e. without removing redundant polygons, used for visualisation in 2D
    """
    map_vector = np.zeros(int(360 / polygon_size) * int(180 / polygon_size))
    if len(a_list) == 0:
        return map_vector
    max_pop = a_list[0][2] if a_list[0][2] > 0 else 1
    for s in a_list:
        index = coord_to_index((s[0], s[1]), polygon_size)
        map_vector[index] += float(max(s[2], 1)) / max_pop
    return map_vector / map_vector.max() if map_vector.max() > 0.0 else map_vector


def merge_lists(lists):
    """
    Utility function to merge multiple lists.
    :param lists: a list of lists to be merged
    :return: one single list with all items from above list of lists
    """
    out = []
    for l in lists:
        out.extend(l)
    return out


def populate_sql():
    """
    Create and populate the sqlite3 database with GeoNames data. Requires Geonames dump.
    No need to run this function, I share the database as a separate dump on GitHub (see link).
    """
    geo_names = {}
    p_map = {"PPLC": 100000, "PCLI": 100000, "PCL": 100000, "PCLS": 10000, "PCLF": 10000, "CONT": 100000, "RGN": 100000}

    for line in codecs.open(u"../data/allCountries.txt", u"r", encoding=u"utf-8"):
        line = line.split("\t")
        feat_code = line[7]
        class_code = line[6]
        pop = int(line[14])
        for name in [line[1], line[2]] + line[3].split(","):
            name = name.lower()
            if len(name) != 0:
                if name in geo_names:
                    already_have_entry = False
                    for item in geo_names[name]:
                        if great_circle((float(line[4]), float(line[5])), (item[0], item[1])).km < 100:
                            if item[2] >= pop:
                                already_have_entry = True
                    if not already_have_entry:
                        pop = get_population(class_code, feat_code, p_map, pop)
                        geo_names[name].add((float(line[4]), float(line[5]), pop, feat_code))
                else:
                    pop = get_population(class_code, feat_code, p_map, pop)
                    geo_names[name] = {(float(line[4]), float(line[5]), pop, feat_code)}

    conn = sqlite3.connect(u'../data/geonames.db')
    c = conn.cursor()
    # c.execute("CREATE TABLE GEO (NAME VARCHAR(100) PRIMARY KEY NOT NULL, METADATA VARCHAR(5000) NOT NULL);")
    c.execute(u"DELETE FROM GEO")  # alternatively, delete the database file.
    conn.commit()

    for gn in geo_names:
        c.execute(u"INSERT INTO GEO VALUES (?, ?)", (gn, str(list(geo_names[gn]))))
    print(u"Entries saved:", len(geo_names))
    conn.commit()
    conn.close()


def get_population(class_code, feat_code, p_map, pop):
    """
    Utility function to eliminate code duplication. Nothing of much interest, methinks.
    :param class_code: Geonames code for the class of location
    :param feat_code: Geonames code for the feature type of an database entry
    :param p_map: dictionary mapping feature codes to estimated population
    :param pop: population count
    :return: population (modified if class code is one of A, P or L.
    """
    if pop == 0 and class_code in ["A", "P", "L"]:
        pop = p_map.get(feat_code, 0)
    return pop


def generate_training_data():
    """
    Prepare Wikipedia training data. Please download the required files from GitHub.
    Files: geonames.db and geowiki.txt both inside the data folder (see README)
    Alternatively, create your own with http://medialab.di.unipi.it/wiki/Wikipedia_Extractor
    """
    conn = sqlite3.connect(u'../data/geonames.db')
    c = conn.cursor()
    nlp = spacy.load(u'en')  # or spacy.load(u'en_core_web_lg') depending on your Spacy Download (simple, full)
    padding = nlp(u"0")[0]
    inp = codecs.open(u"../data/geowiki.txt", u"r", encoding=u"utf-8")
    o = codecs.open(u"../data/train_wiki.txt", u"w", encoding=u"utf-8")
    lat, lon = u"", u""
    target, string = u"", u""
    skipped = 0

    for line in inp:
        if len(line.strip()) == 0:
            continue
        limit = 0
        if line.startswith(u"NEW ARTICLE::"):
            if len(string.strip()) > 0 and len(target) != 0:
                locations_near, locations_far = [], []
                doc = nlp(string)
                for d in doc:
                    if d.text == target[0]:
                        if u" ".join(target) == u" ".join([t.text for t in doc[d.i:d.i + len(target)]]):
                            near_inp = pad_list(CONTEXT_LENGTH / 2, [x for x in doc[max(0, d.i - CONTEXT_LENGTH / 2):d.i]], True, padding) \
                                       + pad_list(CONTEXT_LENGTH / 2, [x for x in doc[d.i + len(target): d.i + len(target) + CONTEXT_LENGTH / 2]], False, padding)
                            far_inp = pad_list(CONTEXT_LENGTH / 2, [x for x in doc[max(0, d.i - CONTEXT_LENGTH):max(0, d.i - CONTEXT_LENGTH / 2)]], True, padding) \
                                      + pad_list(CONTEXT_LENGTH / 2, [x for x in doc[d.i + len(target) + CONTEXT_LENGTH / 2: d.i + len(target) + CONTEXT_LENGTH]], False, padding)
                            near_out, far_out = [], []
                            location = u""
                            for (out_list, in_list, is_near) in [(near_out, near_inp, True), (far_out, far_inp, False)]:
                                for index, item in enumerate(in_list):
                                    if item.ent_type_ in [u"GPE", u"FACILITY", u"LOC", u"FAC", u"LOCATION"]:
                                        if item.ent_iob_ == u"B" and item.text.lower() == u"the":
                                            out_list.append(item.text.lower())
                                        else:
                                            location += item.text + u" "
                                            out_list.append(u"**LOC**" + item.text.lower())
                                    elif item.ent_type_ in [u"PERSON", u"DATE", u"TIME", u"PERCENT", u"MONEY",
                                                            u"QUANTITY", u"CARDINAL", u"ORDINAL"]:
                                        out_list.append(u'0')
                                    elif item.is_punct:
                                        out_list.append(u'0')
                                    elif item.is_digit or item.like_num:
                                        out_list.append(u'0')
                                    elif item.like_email:
                                        out_list.append(u'0')
                                    elif item.like_url:
                                        out_list.append(u'0')
                                    elif item.is_stop:
                                        out_list.append(u'0')
                                    else:
                                        out_list.append(item.lemma_)
                                    if location.strip() != u"" and (item.ent_type == 0 or index == len(in_list) - 1):
                                        location = location.strip()
                                        coords = get_coordinates(c, location)
                                        if len(coords) > 0 and location != u" ".join(target):
                                            if is_near:
                                                locations_near.append(coords)
                                            else:
                                                locations_far.append(coords)
                                        else:
                                            offset = 1 if index == len(in_list) - 1 else 0
                                            for i in range(index - len(location.split()), index):
                                                out_list[i + offset] = in_list[i + offset].lemma_ \
                                                if in_list[i + offset].is_alpha and location != u" ".join(target) else u'0'
                                        location = u""
                            target_grid = get_coordinates(c, u" ".join(target))
                            if len(target_grid) == 0:
                                skipped += 1
                                break
                            entities_near = merge_lists(locations_near)
                            entities_far = merge_lists(locations_far)
                            locations_near, locations_far = [], []
                            o.write(lat + u"\t" + lon + u"\t" + str(near_out) + u"\t" + str(far_out) + u"\t")
                            o.write(str(target_grid) + u"\t" + str([t.lower() for t in target][:TARGET_LENGTH]))
                            o.write(u"\t" + str(entities_near) + u"\t" + str(entities_far) + u"\n")
                            limit += 1
                            if limit > 29:
                                break
            line = line.strip().split("\t")
            if u"(" in line[1]:
                line[1] = line[1].split(u"(")[0].strip()
            if line[1].strip().startswith(u"Geography of "):
                target = line[1].replace(u"Geography of ", u"").split()
            elif u"," in line[1]:
                target = line[1].split(u",")[0].strip().split()
            else:
                target = line[1].split()
            lat = line[2]
            lon = line[3]
            string = ""
            print(u"Processed", limit, u"Skipped:", skipped, u"Name:", u" ".join(target))
        else:
            string += line
    o.close()


def generate_evaluation_data(corpus, file_name):
    """
    Create evaluation data from text files. See README for formatting and download instructions.
    :param corpus: name of the dataset such as LGL, GEOVIRUS or WIKTOR
    :param file_name: an affix, in case you're creating several versions of the same dataset
    """
    conn = sqlite3.connect(u'../data/geonames.db')
    c = conn.cursor()
    nlp = spacy.load(u'en')  # or spacy.load(u'en_core_web_lg'), it depends on your choice of model
    padding = nlp(u"0")[0]
    directory = u"../data/" + corpus + u"/"
    o = codecs.open(u"data/eval_" + corpus + file_name + u".txt", u"w", encoding=u"utf-8")
    line_no = 0 if corpus == u"lgl" else -1

    for line in codecs.open(u"data/" + corpus + file_name + u".txt", u"r", encoding=u"utf-8"):
        line_no += 1
        if len(line.strip()) == 0:
            continue
        for toponym in line.split(u"||")[:-1]:
            captured = False
            doc = nlp(codecs.open(directory + str(line_no), u"r", encoding=u"utf-8").read())
            locations_near, locations_far = [], []
            toponym = toponym.split(u",,")
            target = [t.text for t in nlp(toponym[1])]
            ent_length = len(u" ".join(target))
            lat, lon = toponym[2], toponym[3]
            start, end = int(toponym[4]), int(toponym[5])
            for d in doc:
                if d.text == target[0]:
                    if u" ".join(target) == u" ".join([t.text for t in doc[d.i:d.i + len(target)]]):
                        if abs(d.idx - start) > 4 or abs(d.idx + ent_length - end) > 4:
                            continue
                        captured = True
                        near_inp = pad_list(CONTEXT_LENGTH / 2, [x for x in doc[max(0, d.i - CONTEXT_LENGTH / 2):d.i]], True, padding) \
                                 + pad_list(CONTEXT_LENGTH / 2, [x for x in doc[d.i + len(target): d.i + len(target) + CONTEXT_LENGTH / 2]], False, padding)
                        far_inp = pad_list(CONTEXT_LENGTH / 2, [x for x in doc[max(0, d.i - CONTEXT_LENGTH):max(0, d.i - CONTEXT_LENGTH / 2)]], True, padding) \
                                + pad_list(CONTEXT_LENGTH / 2, [x for x in doc[d.i + len(target) + CONTEXT_LENGTH / 2: d.i + len(target) + CONTEXT_LENGTH]], False, padding)
                        near_out, far_out = [], []
                        location = u""
                        for (out_list, in_list, is_near) in [(near_out, near_inp, True), (far_out, far_inp, False)]:
                            for index, item in enumerate(in_list):
                                if item.ent_type_ in [u"GPE", u"FACILITY", u"LOC", u"FAC", u"LOCATION"]:
                                    if item.ent_iob_ == u"B" and item.text.lower() == u"the":
                                        out_list.append(item.text.lower())
                                    else:
                                        location += item.text + u" "
                                        out_list.append(u"**LOC**" + item.text.lower())
                                elif item.ent_type_ in [u"PERSON", u"DATE", u"TIME", u"PERCENT", u"MONEY",
                                                        u"QUANTITY", u"CARDINAL", u"ORDINAL"]:
                                    out_list.append(u'0')
                                elif item.is_punct:
                                    out_list.append(u'0')
                                elif item.is_digit or item.like_num:
                                    out_list.append(u'0')
                                elif item.like_email:
                                    out_list.append(u'0')
                                elif item.like_url:
                                    out_list.append(u'0')
                                elif item.is_stop:
                                    out_list.append(u'0')
                                else:
                                    out_list.append(item.lemma_)
                                if location.strip() != u"" and (item.ent_type == 0 or index == len(in_list) - 1):
                                    location = location.strip()
                                    coords = get_coordinates(c, location)
                                    if len(coords) > 0 and location != u" ".join(target):
                                        if is_near:
                                            locations_near.append(coords)
                                        else:
                                            locations_far.append(coords)
                                    else:
                                        offset = 1 if index == len(in_list) - 1 else 0
                                        for i in range(index - len(location.split()), index):
                                            out_list[i + offset] = in_list[i + offset].lemma_ \
                                                if in_list[i + offset].is_alpha and location != u" ".join(target) else u'0'
                                    location = u""

                        lookup = toponym[0] if corpus != u"wiki" else toponym[1]
                        target_grid = get_coordinates(c, lookup)
                        if len(target_grid) == 0:
                            raise Exception(u"No entry in the database!", lookup)
                        entities_near = merge_lists(locations_near)
                        entities_far = merge_lists(locations_far)
                        locations_near, locations_far = [], []
                        o.write(lat + u"\t" + lon + u"\t" + str(near_out) + u"\t" + str(far_out) + u"\t")
                        o.write(str(target_grid) + u"\t" + str([t.lower() for t in lookup.split()][:TARGET_LENGTH]))
                        o.write(u"\t" + str(entities_near) + u"\t" + str(entities_far) + u"\n")
            if not captured:
                print line_no, line, target, start, end
    o.close()


def visualise_2D_grid(x, title, log=False):
    """
    Display 2D array data with a title. Optional: log for better visualisation of small values.
    :param x: 2D numpy array you want to visualise
    :param title: of the chart because it's nice to have one :-)
    :param log: True in order to log the values and make for better visualisation, False for raw numbers
    """
    if log:
        x = np.log10(x)
    cmap = colors.LinearSegmentedColormap.from_list('my_colormap', ['lightgrey', 'darkgrey', 'dimgrey', 'black'])
    cmap.set_bad(color='white')
    img = pyplot.imshow(x, cmap=cmap, interpolation='nearest')
    pyplot.colorbar(img, cmap=cmap)
    plt.title(title)
    # plt.savefig(title + u".png", dpi=200, transparent=True)  # Uncomment to save to file
    plt.show()


def generate_vocabulary(path, min_words, min_entities):
    """
    Prepare the vocabulary for training/testing. This function is to be called on generated data only, not plain text.
    :param path: to the file from which to build
    :param min_words: occurrence for inclusion in the vocabulary
    :param min_entities: occurrence for inclusion in the vocabulary
    """
    vocab_words, vocab_locations = {UNKNOWN, u'0'}, {UNKNOWN, u'0'}
    words, locations = [], []
    for f in [path]:  # You can also build the vocabulary from several files, just add to the list.
        training_file = codecs.open(f, u"r", encoding=u"utf-8")
        for line in training_file:
            line = line.strip().split("\t")
            words.extend([w for w in eval(line[2]) if u"**LOC**" not in w])  # NEAR WORDS
            words.extend([w for w in eval(line[3]) if u"**LOC**" not in w])  # FAR WORDS
            locations.extend([w for w in eval(line[2]) if u"**LOC**" in w])  # NEAR ENTITIES
            locations.extend([w for w in eval(line[3]) if u"**LOC**" in w])  # FAR ENTITIES

    words = Counter(words)
    for word in words:
        if words[word] > min_words:
            vocab_words.add(word)
    print(u"Words saved:", len(vocab_words))

    locations = Counter(locations)
    for location in locations:
        if locations[location] > min_entities:
            vocab_locations.add(location.replace(u"**LOC**", u""))
    print(u"Locations saved:", len(vocab_locations))

    vocabulary = vocab_words.union(vocab_locations)
    word_to_index = dict([(w, i) for i, w in enumerate(vocabulary)])
    cPickle.dump(word_to_index, open(u"data/words2index.pkl", "w"))


def generate_arrays_from_file(path, words_to_index, train=True):
    """
    Generator function for the FULL (SOTA) CNN + map_vector model in the paper. Uses all available data inputs.
    :param path: to the training file (see training data generation functions)
    :param words_to_index: the vocabulary set
    :param train: True is generating training data, false for test data
    """
    while True:
        training_file = codecs.open(path, "r", encoding="utf-8")
        counter = 0
        context_words, entities_strings, labels = [], [], []
        map_vector, target_string = [], []
        for line in training_file:
            counter += 1
            line = line.strip().split("\t")
            labels.append(construct_map_vector([(float(line[0]), float(line[1]), 0)], 2, ENCODING_MAP_2x2, OUTLIERS_MAP_2x2))

            near = [w if u"**LOC**" not in w else u'0' for w in eval(line[2])]
            far = [w if u"**LOC**" not in w else u'0' for w in eval(line[3])]
            context_words.append(far[:CONTEXT_LENGTH / 2] + near + far[CONTEXT_LENGTH / 2:])

            near = [w.replace(u"**LOC**", u"") if u"**LOC**" in w else u'0' for w in eval(line[2])]
            far = [w.replace(u"**LOC**", u"") if u"**LOC**" in w else u'0' for w in eval(line[3])]
            entities_strings.append(far[:CONTEXT_LENGTH / 2] + near + far[CONTEXT_LENGTH / 2:])

            # map_vector.append(construct_map_vector(sorted(eval(line[4]) + eval(line[6]) + eval(line[7]),
            #                key=lambda (a, b, c, d): c, reverse=True), 1, ENCODING_MAP_1x1, OUTLIERS_MAP_1x1))
            # paper version above versus small experimental setup below, map_vector is fully modular, remember? Try both!
            map_vector.append(construct_map_vector(eval(line[4]) + eval(line[6]) + eval(line[7]), 1, ENCODING_MAP_1x1, OUTLIERS_MAP_1x1))
            target_string.append(pad_list(TARGET_LENGTH, eval(line[5]), True, u'0'))

            if counter % BATCH_SIZE == 0:
                for collection in [context_words, entities_strings, target_string]:
                    for x in collection:
                        for i, w in enumerate(x):
                            if w in words_to_index:
                                x[i] = words_to_index[w]
                            else:
                                x[i] = words_to_index[UNKNOWN]
                if train:
                    yield ([np.asarray(context_words), np.asarray(context_words), np.asarray(entities_strings),
                            np.asarray(entities_strings), np.asarray(map_vector), np.asarray(target_string)], np.asarray(labels))
                else:
                    yield ([np.asarray(context_words), np.asarray(context_words), np.asarray(entities_strings),
                            np.asarray(entities_strings), np.asarray(map_vector), np.asarray(target_string)])

                context_words, entities_strings, labels = [], [], []
                map_vector, target_string = [], []

        if len(labels) > 0:  # This block is only ever entered at the end to yield the final few samples. (< BATCH_SIZE)
            for collection in [context_words, entities_strings, target_string]:
                for x in collection:
                    for i, w in enumerate(x):
                        if w in words_to_index:
                            x[i] = words_to_index[w]
                        else:
                            x[i] = words_to_index[UNKNOWN]
            if train:
                yield ([np.asarray(context_words), np.asarray(context_words), np.asarray(entities_strings),
                        np.asarray(entities_strings), np.asarray(map_vector), np.asarray(target_string)], np.asarray(labels))
            else:
                yield ([np.asarray(context_words), np.asarray(context_words), np.asarray(entities_strings),
                        np.asarray(entities_strings), np.asarray(map_vector), np.asarray(target_string)])


def generate_arrays_from_file_lstm(path, words_to_index, train=True):
    """
    Generator for the context2vec model. Uses only lexical features.
    To replicate the map_vector + CONTEXT2VEC model from the paper, uncomment a few sections below
    and in the context2vec.py file. I hope it's clear enough :-) Email me if it isn't!
    :param path: to the training file (see training data generation functions)
    :param words_to_index: the vocabulary set
    :param train: True for training stage, False for testing stage
    """
    while True:
        training_file = codecs.open(path, "r", encoding="utf-8")
        counter = 0
        left, right, map_vector = [], [], []
        target_string, labels = [], []
        for line in training_file:
            counter += 1
            line = line.strip().split("\t")
            labels.append(construct_map_vector([(float(line[0]), float(line[1]), 0)], 2, ENCODING_MAP_2x2, OUTLIERS_MAP_2x2))

            near = [w.replace(u"**LOC**", u"") for w in eval(line[2])]
            far = [w.replace(u"**LOC**", u"") for w in eval(line[3])]
            left.append(far[:CONTEXT_LENGTH / 2] + near[:CONTEXT_LENGTH / 2])
            right.append(near[CONTEXT_LENGTH / 2:] + far[CONTEXT_LENGTH / 2:])

            target_string.append(pad_list(TARGET_LENGTH, eval(line[5]), True, u'0'))

            # map_vector.append(construct_map_vector(eval(line[4]) + eval(line[6]) + eval(line[7]), 1, ENCODING_MAP_1x1, OUTLIERS_MAP_1x1))

            if counter % BATCH_SIZE == 0:
                for collection in [left, right, target_string]:
                    for x in collection:
                        for i, w in enumerate(x):
                            if w in words_to_index:
                                x[i] = words_to_index[w]
                            else:
                                x[i] = words_to_index[UNKNOWN]
                if train:
                    yield ([np.asarray(left), np.asarray(right), np.asarray(target_string)], np.asarray(labels))
                    # yield ([np.asarray(left), np.asarray(right), np.asarray(map_vector), np.asarray(target_string)], np.asarray(labels))
                else:
                    yield ([np.asarray(left), np.asarray(right), np.asarray(target_string)])
                    # yield ([np.asarray(left), np.asarray(right), np.asarray(map_vector), np.asarray(target_string)])

                left, right, map_vector = [], [], []
                target_string, labels = [], []

        if len(labels) > 0:  # This block is only ever entered at the end to yield the final few samples. (< BATCH_SIZE)
            for collection in [left, right, target_string]:
                for x in collection:
                    for i, w in enumerate(x):
                        if w in words_to_index:
                            x[i] = words_to_index[w]
                        else:
                            x[i] = words_to_index[UNKNOWN]
            if train:
                yield ([np.asarray(left), np.asarray(right), np.asarray(target_string)], np.asarray(labels))
                # yield ([np.asarray(left), np.asarray(right), np.asarray(map_vector), np.asarray(target_string)], np.asarray(labels))
            else:
                yield ([np.asarray(left), np.asarray(right), np.asarray(target_string)])
                # yield ([np.asarray(left), np.asarray(right), np.asarray(map_vector), np.asarray(target_string)])


def generate_strings_from_file(path):
    """
    Generator of labels, location names and context. Used for training and testing.
    :param path: to the training file (see training data generation functions)
    :return: Yields a list of tuples [(label, location name, context), ...]
    """
    while True:
        for line in codecs.open(path, "r", encoding="utf-8"):
            line = line.strip().split("\t")
            context = u" ".join(eval(line[2])) + u"*E*" + u" ".join(eval(line[5])) + u"*E*" + u" ".join(eval(line[3]))
            yield ((float(line[0]), float(line[1])), u" ".join(eval(line[5])).strip(), context)


def generate_arrays_from_file_map_vector(path, train=True, looping=True):
    """
    Generator for the plain map_vector model, works for MLP, Naive Bayes or Random Forest. Table 2 in the paper.
    :param path: to the training file (see training data generation functions)
    :param train: True for training phase, False for testing phase
    :param looping: True for continuous generation, False for one iteration.
    """
    while True:
        training_file = codecs.open(path, "r", encoding="utf-8")
        counter = 0
        labels, target_coord = [], []
        for line in training_file:
            counter += 1
            line = line.strip().split("\t")
            labels.append(construct_map_vector([(float(line[0]), float(line[1]), 0, u'')], 2, ENCODING_MAP_2x2, OUTLIERS_MAP_2x2))
            target_coord.append(construct_map_vector(eval(line[4]) + eval(line[6]) + eval(line[7]), 1, ENCODING_MAP_1x1, OUTLIERS_MAP_1x1))

            if counter % BATCH_SIZE == 0:
                if train:
                    yield ([np.asarray(target_coord)], np.asarray(labels))
                else:
                    yield ([np.asarray(target_coord)])

                labels = []
                target_coord = []

        if len(labels) > 0:
            # This block is only ever entered at the end to yield the final few samples. (< BATCH_SIZE)
            if train:
                yield ([np.asarray(target_coord)], np.asarray(labels))
            else:
                yield ([np.asarray(target_coord)])
        if not looping:
            break


def shrink_map_vector(polygon_size):
    """
    Remove polygons that only cover oceans. Dumps a dictionary of DB entries.
    :param polygon_size: the size of each polygon such as 1x1 or 2x2 or 3x3 degrees (integer)
    """
    map_vector = np.zeros((180 / polygon_size) * (360 / polygon_size),)
    for line in codecs.open(u"../data/allCountries.txt", u"r", encoding=u"utf-8"):
        line = line.split("\t")
        lat, lon = float(line[4]), float(line[5])
        index = coord_to_index((lat, lon), polygon_size=polygon_size)
        map_vector[index] += 1.0
    cPickle.dump(map_vector, open(u"mapvec_shrink.pkl", "w"))


def oracle(path):
    """
    Calculate the Oracle (best possible given your database) performance for a given dataset.
    Prints the Oracle scores including mean, median, AUC and acc@161.
    :param path: file path to evaluate
    """
    final_errors = []
    conn = sqlite3.connect(u'../data/geonames.db')
    for line in codecs.open(path, "r", encoding="utf-8"):
        line = line.strip().split("\t")
        coordinates = (float(line[0]), float(line[1]))
        best_candidate = []
        for candidate in get_coordinates(conn.cursor(), u" ".join(eval(line[5])).strip()):
            best_candidate.append(great_circle(coordinates, (float(candidate[0]), float(candidate[1]))).km)
        final_errors.append(sorted(best_candidate)[0])
    print_stats(final_errors)


# --------------------------------------------- INVOKE FUNCTIONS ---------------------------------------------------
# prepare_geocorpora()
# print get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"dublin")
# generate_training_data()
# generate_evaluation_data(corpus="geovirus", file_name="")
# generate_vocabulary(path=u"../data/train_wiki.txt", min_words=9, min_entities=1)
# shrink_map_vector(2)
# oracle(u"data/eval_geovirus_gold.txt")
# conn = sqlite3.connect('../data/geonames.db')
# c = conn.cursor()
# c.execute("INSERT INTO GEO VALUES (?, ?)", (u"darfur", u"[(13.5, 23.5, 0), (44.05135, -94.83804, 106)]"))
# c.execute("DELETE FROM GEO WHERE name = 'darfur'")
# conn.commit()
# print index_to_coord(8177, 2)
# populate_sql()

# -------- CREATE MAPS (mapping from 64,000/16,200 polygons to 23,002, 7,821) ------------
# map_vector = list(cPickle.load(open(u"data/1x1_geonames.pkl")))
# zeros = dict([(i, v) for i, v in enumerate(map_vector) if v > 0])  # isolate the non zero values
# zeros = dict([(i, v) for i, v in enumerate(zeros)])                # replace counts with indices
# zeros = dict([(v, i) for (i, v) in zeros.iteritems()])             # reverse keys and values
# cPickle.dump(zeros, open(u"data/1x1_encode_map.pkl", "w"))

# ------- VISUALISE THE WHOLE DATABASE ----------
# map_vector = np.reshape(map_vector, newshape=((180 / 1), (360 / 1)))
# visualise_2D_grid(map_vector, "Geonames Database", True)

# -------- CREATE OUTLIERS (polygons outside of map_vector) MAP --------
# filtered = [i for i, v in enumerate(map_vector) if v > 0]
# the_rest = [i for i, v in enumerate(map_vector) if v == 0]
# poly_size = 2
# dict_rest = dict()
#
# for poly_rest in the_rest:
#     best_index = 100000
#     best_dist = 100000
#     for poly_filtered in filtered:
#         dist = great_circle(index_to_coord(poly_rest, poly_size), index_to_coord(poly_filtered, poly_size)).km
#         if dist < best_dist:
#             best_index = poly_filtered
#             best_dist = dist
#     dict_rest[poly_rest] = best_index
#
# cPickle.dump(dict_rest, open(u"data/2x2_outliers_map.pkl", "w"))

# ------ PROFILING SETUP -----------
# import cProfile, pstats, StringIO
# pr = cProfile.Profile()
# pr.enable()
# CODE HERE
# pr.disable()
# s = StringIO.StringIO()
# sortby = 'cumulative'
# ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
# ps.print_stats()
# print s.getvalue()

# ----------- VISUALISATION OF DIFFERENT LOCATIONS -------------
# print len(get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"Melbourne"))
# coord = get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"Giza")
# print coord
# coord.extend(get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"Giza Plateau"))
# coord.extend(get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"Cairo"))
# coord.extend(get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"Egypt"))
# coord = sorted(coord, key=lambda (a, b, c, d): c, reverse=True)
# x = construct_map_vector_full_scale(coord, polygon_size=2)
# x = np.reshape(x, newshape=((180 / 2), (360 / 2)))
# visualise_2D_grid(x, "Giza, Giza Plateau, Egypt, Cairo", True)

# ---------- DUMP DATABASE ------
# import sqlite3
#
# con = sqlite3.connect('../data/geonames.db')
# with codecs.open('dump.sql', 'w', 'utf-8') as f:
#     for line in con.iterdump():
#         f.write('%s\n' % line)
# -------------------------------