# Python source code of helper

#!/usr/bin/env python
"""
 AUTHOR: Gabriel Bassett
 DATE: <01-23-2015>
 DEPENDENCIES: networkx, numpy, scipy
 Copyright 2015 Gabriel Bassett

 LICENSE:
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.

 DESCRIPTION:
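 Helper functions for building topic graphs, validating node URIs, measuring
 topic distance, comparing node scores, and merging and sanitizing networkx graphs.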

 NOTES:
 <No Notes>

 ISSUES:
 <No Issues>

 TODO:
 <No TODO>

"""
# PRE-USER SETUP
import logging

########### NOT USER EDITABLE ABOVE THIS POINT #################


# USER VARIABLES
CONFIG_FILE = ""
LOGLEVEL = logging.DEBUG
LOG = None

########### NOT USER EDITABLE BELOW THIS POINT #################


## IMPORTS

import argparse
import ConfigParser
import networkx as nx
import urlparse
import numpy as np
from scipy import stats  # for percentile

## SETUP
__author__ = "Gabriel Bassett"

if __name__ == "__main__":
    # Parse Arguments (should correspond to user variables)
    parser = argparse.ArgumentParser(description='This script processes a graph.')
    parser.add_argument('-d', '--debug',
                        help='Print lots of debugging statements',
                        action="store_const", dest="loglevel", const=logging.DEBUG,
                        default=LOGLEVEL
                       )
    parser.add_argument('-v', '--verbose',
                        help='Be verbose',
                        action="store_const", dest="loglevel", const=logging.INFO
                       )
    parser.add_argument('--log', help='Location of log file', default=LOG)
    # The config step further down reads args.config; without this option
    # every script run would stop with an AttributeError.
    parser.add_argument('--config', help='Location of config file', default=CONFIG_FILE)
    args = parser.parse_args()

# add config arguments
if __name__ == "__main__":
    CONFIG_FILE = args.config
try:
    config = ConfigParser.SafeConfigParser()
    # Context manager so the file handle is released even if parsing fails.
    with open(CONFIG_FILE) as config_file:
        config.readfp(config_file)
    config_exists = True
except Exception:
    # Best effort: a missing, unreadable or malformed config file simply means
    # "no config".  Exception (not a bare except) lets Ctrl-C and SystemExit
    # propagate instead of being silently swallowed.
    config_exists = False
if config_exists:
    if config.has_section('LOGGING'):
        # Translate the configured level name into a logging constant;
        # an unknown name or a missing option falls back to WARNING.
        if 'level' in config.options('LOGGING'):
            level = config.get('LOGGING', 'level')
            if level == 'debug':
                loglevel = logging.DEBUG
            elif level == 'verbose':
                loglevel = logging.INFO
            else:
                loglevel = logging.WARNING
        else:
            loglevel = logging.WARNING
        # Optional log file location; None when the option is absent.
        if 'log' in config.options('LOGGING'):
            log = config.get('LOGGING', 'log')
        else:
            log = None


## Set up Logging
if __name__ == "__main__":
    # Always pass the level; add the file name only when a log file was requested.
    logging_options = {'level': args.loglevel}
    if args.log is not None:
        logging_options['filename'] = args.log
    logging.basicConfig(**logging_options)
# <add other setup here>


## GLOBAL EXECUTION
pass


## FUNCTION DEFINITION
def create_topic(properties, prefix=""):
    """Build a topic graph holding one attribute node per property value.

    :param properties: A dictionary of properties, or an iterable of (key, value) pairs
    :param prefix: String placed in front of every generated node URI
    :return: A topic graph in networkx format with one node per property

    NOTE: If multiple values of a certain type, (e.g. multiple IPs) make the value of the type
           in the dictionary a list.
    """
    g = nx.DiGraph()

    # isinstance rather than an exact type comparison, so dict subclasses
    # (OrderedDict, defaultdict, ...) are walked as key/value pairs instead of
    # falling through to plain iteration, which would yield bare keys.
    if isinstance(properties, dict):
        pairs = properties.items()
    else:
        pairs = iter(properties)

    for key, value in pairs:
        # A list, set or numpy array contributes one node per element;
        # anything else is treated as a single value.
        if isinstance(value, (list, set, np.ndarray)):
            values = value
        else:
            values = [value]

        for v in values:
            node_uri = "{2}class=attribute&key={0}&value={1}".format(key, v, prefix)
            g.add_node(node_uri, {
                'class': 'attribute',
                'key': key,
                'value': v,
                'uri': node_uri
            })

    return g


def validate_uri(uri):
    """Check that a node URI carries exactly one class, one key and one value.

    The properties are read from the URI's query component.  When the URI has
    no query component (e.g. the bare "class=attribute&key=ip&value=1.2.3.4"),
    its path is parsed as the property string instead.

    :param uri: a URI string to be validated
    :return: bool true if valid, false if not
    """
    # TODO: Validate the order properties are in (important for uri hash lookup)

    # Function-scope import so the check runs on both Python 2 (urlparse)
    # and Python 3 (urllib.parse).
    try:
        import urlparse as url_lib
    except ImportError:
        import urllib.parse as url_lib

    try:
        parsed = url_lib.urlparse(uri)
        properties = url_lib.parse_qs(parsed.query or parsed.path)
    except Exception:
        # Anything that cannot be parsed as a URI is simply not valid.
        return False

    # key and value must each appear exactly once.
    for required in (u'key', u'value'):
        if len(properties.get(required, ())) != 1:
            return False

    # Node URIs elsewhere in this file spell the node type "class=<type>".
    # The earlier version of this check demanded an "attribute" parameter,
    # which is still accepted so URIs that used to validate keep validating.
    return any(len(properties.get(name, ())) == 1 for name in (u'class', u'attribute'))


def get_topic_distance(sg, topic):
    """Find, for each node reachable from the topic, its distance to the nearest topic node.

    Topic nodes that are not present in sg are ignored.

    :param sg: an egocentric subgraph in networkx format
    :param topic: a networkx graph of nodes representing the topic
    :return: a dictionary of key node name and value distance as integer
    """
    min_dist = dict()

    for tnode in topic.nodes():
        # Graph membership is a hash lookup; "in sg.nodes()" would scan a
        # list once per topic node on networkx 1.x.
        if tnode not in sg:
            continue

        # Shortest path lengths from this topic node to every node it reaches.
        lengths = nx.shortest_path_length(sg, source=tnode)

        # Keep the smallest distance seen so far for each node.
        for node, dist in lengths.items():
            if node not in min_dist or dist < min_dist[node]:
                min_dist[node] = dist

    # Return the dict
    return min_dist


def compare_classifications(scores, node1, node2=None, output="print"):
    """Compare the score of node 1 with node 2 (or with the median of all scores).

    Both scores are divided by the lower of the two, so the lower one is
    reported as 1.0.  A lower score of zero is not handled specially.

    :param scores: dictionary keyed by nodes and values of scores
    :param node1: dictionary of {"class":<class>, "key":<key>, "value":<value>}
    :param node2: dictionary of {"class":<class>, "key":<key>, "value":<value>}.  If empty, score will be compared to the median
    :param output: string representing how to output the data.  "print" to print it, dictionary otherwise
    :return: ratio of node 1 to node 2 scores normalized to the lower score as dictionary
             (nothing is returned when output is "print")
    :raises KeyError: if a node's URI is not a key of scores
    """
    node1_uri = "class={0}&key={1}&value={2}".format(node1['class'], node1['key'], node1['value'])
    node1_score = scores[node1_uri]

    if node2 is None:
        # list() so numpy gets a real sequence even where dict.values() is a view (Python 3).
        node2_score = np.median(list(scores.values()))
    else:
        node2_uri = "class={0}&key={1}&value={2}".format(node2['class'], node2['key'], node2['value'])
        node2_score = scores[node2_uri]

    # Normalize to the lower score; on a tie node 1's score is the baseline.
    if node1_score > node2_score:
        baseline = float(node2_score)
    else:
        baseline = float(node1_score)

    if output != "print":
        return {"node1": node1_score / baseline, "node2": node2_score / baseline}

    node1_ratio = round(node1_score / baseline, 4)
    node2_ratio = round(node2_score / baseline, 4)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    if node2 is None:
        print("The ratio of node 1 ({0}:{1}) to the median ({2}) is {3}:{4}.".format(node1['key'],
                                                                                     node1['value'],
                                                                                     node2_score,
                                                                                     node1_ratio,
                                                                                     node2_ratio))
    else:
        print("The ratio of node 1 ({0}:{1}) to node 2 ({2}:{3}) is {4}:{5}.".format(node1['key'],
                                                                                     node1['value'],
                                                                                     node2['key'],
                                                                                     node2['value'],
                                                                                     node1_ratio,
                                                                                     node2_ratio))


def score_percentile(scores, node, output="print"):
    """Report the percentile of one node's score among all scores.

    :param scores: dictionary keyed by nodes and values of scores
    :param node: dictionary of {"class":<class>, "key":<key>, "value":<value>}
    :param output: string representing how to output the data.  "print" to print it, anything else returns it
    :return: the percentile the node is in (nothing is returned when output is "print").
             Higher means more likely.
    :raises KeyError: if the node's URI is not a key of scores
    """
    node_uri = "class={0}&key={1}&value={2}".format(node['class'], node['key'], node['value'])

    # list() so scipy gets a real sequence even where dict.values() is a view (Python 3).
    p = stats.percentileofscore(list(scores.values()), scores[node_uri])

    if output == "print":
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print("The percentile of the node is {0}.".format(round(p, 4)))
    else:
        return p


def merge_graphs(g1, g2):
    """Return a copy of g1 with every node and edge of g2 added to it.

    :param g1: networkx graph used as the base; the merge works on a copy of it
    :param g2: networkx graph whose nodes, edges and attribute dictionaries are added to the copy
    :return: the merged networkx graph
    """
    merged = g1.copy()

    # Nodes first, each with its attribute dictionary.
    for node_id, node_attrs in g2.nodes(data=True):
        merged.add_node(node_id, node_attrs)

    # Then the edges, carrying their attribute dictionaries along.
    for source, target, edge_attrs in g2.edges(data=True):
        merged.add_edge(source, target, attr_dict=edge_attrs)

    return merged


def removeNonAscii(s):
    """Drop every character whose code point is 128 or above and UTF-8 encode what is left.

    :param s: the string to filter
    :return: the remaining characters joined together and encoded as UTF-8
    """
    ascii_chars = [ch for ch in s if ord(ch) < 128]
    joined = u"".join(ascii_chars)
    return joined.encode('utf8')


def remove_non_ascii_from_graph(g):
    """ networkx graph -> networkx graph

    Strips non-ASCII characters from every string-valued node and edge
    attribute.  The graph is modified in place and also returned.
    Attributes that are not strings (numbers, None, ...) are left untouched.

    :param g: A networkx graph
    :return: a networkx graph with nonAscii removed from all node and edge attributes
    """
    # (str, unicode) on Python 2; collapses to (str, str) on Python 3.
    string_types = (str, type(u""))

    def scrub(attrs):
        # Only values are replaced (never keys), so updating the dictionary
        # while walking a snapshot of its items is safe.
        for attr, val in list(attrs.items()):
            if isinstance(val, string_types):
                attrs[attr] = removeNonAscii(val)

    # The dictionaries yielded with data=True are the graph's own attribute
    # dictionaries, so changing them in place updates the graph; no write-back
    # through g.node / g.edge is needed.
    for node_entry in g.nodes(data=True):
        scrub(node_entry[-1])

    # The attribute dictionary is always the last element of an edge tuple,
    # for plain graphs and multigraphs alike.
    for edge_entry in g.edges(data=True):
        scrub(edge_entry[-1])

    # return the cleaned graph
    return g

## MAIN LOOP EXECUTION
def main():
    """Script entry point: log the start and the end of the main loop.

    Nothing else happens between the two log calls yet.
    """
    logging.info("Beginning main loop.")

    logging.info("Ending main loop.")


if __name__ == "__main__":
    main()