# Python source code of geoname

"""
This script was generated by the train.py script in this repository:
https://github.com/ecohealthalliance/geoname-annotator-training
"""
import numpy as np
from numpy import array, int32


# Numeric cut-offs exposed at module level; neither is referenced by the code
# visible in this file.
# NOTE(review): presumably compared against the positive-class probability
# returned by the classifiers below — confirm against the calling code.
HIGH_CONFIDENCE_THRESHOLD = 0.5
GEONAME_SCORE_THRESHOLD = 0.13
# Parameters of the "base" logistic-regression model used by
# predict_proba_base(). predict_proba() reads only 'coef_' and 'intercept_';
# the remaining keys are not used by the code visible in this file.
# NOTE(review): the other keys look like a dump of scikit-learn
# LogisticRegression settings/attributes written by train.py — confirm.
# Every weight in 'coef_' is labelled with its feature name in the comment on
# the line above it. The six trailing weights (close_locations through
# containing_locations) are all 0.0 in this model.
base_classifier =\
{
    'penalty': 'l1',
    'dual': False,
    'tol': 0.0001,
    'C': 0.1,
    'fit_intercept': True,
    'intercept_scaling': 1,
    'class_weight': None,
    'random_state': None,
    'solver': 'liblinear',
    'max_iter': 100,
    'multi_class': 'auto',
    'verbose': 0,
    'warm_start': False,
    'n_jobs': None,
    'l1_ratio': None,
    'classes_': array([False,  True]),
    'coef_': array([[
        # log_population
        0.2833264880250032,
        # name_count
        0.496850674834566,
        # names_used
        0.6182820633848923,
        # exact_name_match
        1.2348109901150883,
        # multiple_spans
        1.0027275126871056,
        # span_length
        0.1556028330182338,
        # all_acronyms
        -1.723855098135833,
        # cannonical_name_used
        2.4448401433682614,
        # loc_NE_portion
        1.100324515118662,
        # other_NE_portion
        -0.006789540638115011,
        # noun_portion
        1.7846512186329173,
        # num_tokens
        0.4449857820783286,
        # med_token_prob
        -0.35487604034835174,
        # exact_alternatives
        -0.7357808282729202,
        # PPL_feature_code
        -3.313119817081297,
        # ADM_feature_code
        -3.8508572602598976,
        # PCL_feature_code
        0.002625732530454115,
        # other_feature_code
        -2.5725460514389598,
        # first_order
        0.8582848733971942,
        # combined_span
        0.7845557182369852,
        # close_locations
        0.0,
        # very_close_locations
        0.0,
        # base_score
        0.0,
        # base_score_margin
        0.0,
        # contained_locations
        0.0,
        # containing_locations
        0.0,
    ]]),
    'intercept_': array([-14.6902496]),
    'n_iter_': array([44], dtype=int32),
}

# Parameters of the "contextual" logistic-regression model used by
# predict_proba_contextual(). predict_proba() reads only 'coef_' and
# 'intercept_'; the remaining keys are not used by the code visible in this
# file.
# NOTE(review): the other keys look like a dump of scikit-learn
# LogisticRegression settings/attributes written by train.py — confirm.
# Every weight in 'coef_' is labelled with its feature name in the comment on
# the line above it. The six trailing weights (close_locations through
# containing_locations) are non-zero in this model.
contextual_classifier =\
{
    'penalty': 'l1',
    'dual': False,
    'tol': 0.0001,
    'C': 0.1,
    'fit_intercept': True,
    'intercept_scaling': 1,
    'class_weight': None,
    'random_state': None,
    'solver': 'liblinear',
    'max_iter': 100,
    'multi_class': 'auto',
    'verbose': 0,
    'warm_start': False,
    'n_jobs': None,
    'l1_ratio': None,
    'classes_': array([False,  True]),
    'coef_': array([[
        # log_population
        0.3787203315925731,
        # name_count
        0.47246832816657763,
        # names_used
        1.0765607603242244,
        # exact_name_match
        1.4705218728593559,
        # multiple_spans
        1.3801379355279673,
        # span_length
        0.21060539648691756,
        # all_acronyms
        -2.5491642087123516,
        # cannonical_name_used
        2.877038521477874,
        # loc_NE_portion
        1.6424801016350434,
        # other_NE_portion
        -0.38562595379247006,
        # noun_portion
        2.002630501746275,
        # num_tokens
        0.4152636087418877,
        # med_token_prob
        -0.32906537630371446,
        # exact_alternatives
        -0.8984979885089859,
        # PPL_feature_code
        -4.534782739767053,
        # ADM_feature_code
        -5.510120071836727,
        # PCL_feature_code
        0.003626341435930206,
        # other_feature_code
        -3.2323872260723783,
        # first_order
        1.451007214897749,
        # combined_span
        1.480736480667228,
        # close_locations
        0.1719448315096514,
        # very_close_locations
        -0.07886456859233734,
        # base_score
        -5.393131594700634,
        # base_score_margin
        3.119070808829057,
        # contained_locations
        0.1825358655341298,
        # containing_locations
        0.962090412821035,
    ]]),
    'intercept_': array([-14.57426316]),
    'n_iter_': array([52], dtype=int32),
}

# Logistic regression code adapted from scikit-learn
# (LinearClassifierMixin._predict_proba_lr)
def predict_proba(X, classifier):
    """Estimate class probabilities with a one-vs-rest logistic model.

    ``classifier`` is a mapping providing ``'coef_'`` and ``'intercept_'``
    arrays. The decision values ``np.dot(X, coef_.T) + intercept_`` are
    passed through the logistic function ``1 / (1 + exp(-z))``.

    When ``coef_`` has a single row the result has two columns,
    ``[1 - p, p]``. When it has several rows, the per-class values are
    divided by their row sum so each row of the result adds up to one.
    """
    decision = np.dot(X, classifier['coef_'].T) + classifier['intercept_']
    if decision.shape[1] == 1:
        # Binary model: flatten the single score column to a vector.
        decision = decision.ravel()

    positive = 1.0 / (1.0 + np.exp(-decision))

    if positive.ndim == 1:
        # First column is the complement, second the positive-class value.
        return np.array([1 - positive, positive]).T

    # OvR normalization, like LibLinear's predict_probability
    row_totals = positive.sum(axis=1)
    return positive / row_totals.reshape((positive.shape[0], -1))


def predict_proba_base(X):
    """Return ``predict_proba`` of ``X`` under the ``base_classifier``
    parameters."""
    return predict_proba(X, classifier=base_classifier)


def predict_proba_contextual(X):
    """Return ``predict_proba`` of ``X`` under the ``contextual_classifier``
    parameters."""
    return predict_proba(X, classifier=contextual_classifier)