python source code of mapmatcher

"""
-------------------------------------------------------------------------------
# Name:        mapMatcher
# Purpose:      This python script allows map matching (matching of track points to a network)
#               in arcpy using a Hidden Markov model with
#               probabilities parameterized based on spatial + network distances.
#               Follows the ideas in Newson, Krumm (2009):
#               "Hidden markov Map Matching through noise and sparseness"
#
#               Example usage under '__main__'
#
# Author:      Simon Scheider
#
# Created:     01/03/2017
# Copyright:   (c) simon 2017
# Licence:     <your licence>

The code is written in Python 2.7 and depends on:

* arcpy (ships with ArcGIS and its own Python 2.7)
* networkx (# python pip install networkx (https://networkx.github.io))
    (note: requires installing GDAL first, which can be obtained as a wheel from
    http://www.lfd.uci.edu/~gohlke/pythonlibs/ and then installed with pip locally:
    python pip install GDAL-2.1.3-cp27-cp27m-win32.whl
    )

#-------------------------------------------------------------------------------
"""

__author__      = "Simon Scheider"
__copyright__   = ""


import sys

try:
    from math import exp, sqrt
    import os
    import arcpy
    arcpy.env.overwriteOutput = True
    import networkx as nx
    import time

except ImportError:
    print "Error: missing one of the libraries (arcpy, networkx)"
    sys.exit()



def mapMatch(track, segments, decayconstantNet = 30, decayConstantEu = 10, maxDist = 50, addfullpath = True):
    """
    The main method. Based on the Viterbi algorithm for Hidden Markov models,
    see https://en.wikipedia.org/wiki/Viterbi_algorithm.
    It gets trackpoints and segments, and returns the most probable segment path (a list of segments) for the list of points.
    Inputs:
        @param track = a shape file (filename) representing a track, can also be unprojected (WGS84)
        @param segments = a shape file of network segments, should be projected (in meter) to compute Euclidean distances properly (e.g. GCS Amersfoord)
        @param decayconstantNet (optional) = the network distance (in meter) after which the match probability falls under 0.34 (exponential decay). (note this is the inverse of lambda).
        This depends on the point frequency of the track (how far are track points separated?)
        @param decayConstantEu (optional) = the Euclidean distance (in meter) after which the match probability falls under 0.34 (exponential decay). (note this is the inverse of lambda).
        This depends on the positional error of the track points (how far can points deviate from their true position?)
        @param maxDist (optional) = the Euclidean distance threshold (in meter) for taking into account segments candidates.
        @param addfullpath (optional, True or False) = whether a contiguous full segment path should be outputted. If not, a 1-to-1 list of segments matching each track point is outputted.

    note: depending on the type of movement, optional parameters need to be fine tuned to get optimal results.
    """
    #Make sure passed in parameters are floats
    decayconstantNet = float(decayconstantNet)
    decayConstantEu = float(decayConstantEu)
    maxDist= float(maxDist)

    #gest start time
    start_time = time.time()

    #this array stores, for each point in a track, probability distributions over segments, together with the (most probable) predecessor segment taking into account a network distance
    V = [{}]

    #get track points, build network graph (graph, endpoints, lengths) and get segment info from arcpy
    points = getTrackPoints(track, segments)
    r = getSegmentInfo(segments)
    endpoints = r[0]
    lengths = r[1]
    graph = getNetworkGraph(segments,lengths)
    pathnodes = [] #set of pathnodes to prevent loops

    #init first point
    sc = getSegmentCandidates(points[0], segments, decayConstantEu, maxDist)
    for s in sc:
        V[0][s] = {"prob": sc[s], "prev": None, "path": [], "pathnodes":[]}
    # Run Viterbi when t > 0
    for t in range(1, len(points)):
        V.append({})
        #Store previous segment candidates
        lastsc = sc
        #Get segment candidates and their a-priori probabilities (based on Euclidean distance for current point t)
        sc = getSegmentCandidates(points[t], segments, decayConstantEu, maxDist)
        for s in sc:
            max_tr_prob = 0
            prev_ss = None
            path = []
            for prev_s in lastsc:
                #determine the highest network transition probability from previous candidates to s and get the corresponding network path
                pathnodes = V[t-1][prev_s]["pathnodes"][-10:]
                n = getNetworkTransP(prev_s, s, graph, endpoints, lengths, pathnodes, decayconstantNet)
                np = n[0] #This is the network transition probability
                tr_prob = V[t-1][prev_s]["prob"]*np
                #this selects the most probable predecessor candidate and the path to it
                if tr_prob > max_tr_prob:
                    max_tr_prob = tr_prob
                    prev_ss = prev_s
                    path = n[1]
                    if n[2] != None:
                        pathnodes.append(n[2])
            #The final probability of a candidate is the product of a-priori and network transitional probability
            max_prob =  sc[s] * max_tr_prob
            V[t][s] = {"prob": max_prob, "prev": prev_ss, "path": path, "pathnodes":pathnodes}

        #Now max standardize all p-values to prevent running out of digits
        maxv = max(value["prob"] for value in V[t].values())
        maxv = (1 if maxv == 0 else maxv)
        for s in V[t].keys():
            V[t][s]["prob"]=V[t][s]["prob"]/maxv


    intertime1 = time.time()
    print("--- Viterbi forward: %s seconds ---" % (intertime1 - start_time))
    #print V

    #opt is the result: a list of (matched) segments [s1, s2, s3,...] in the exact order of the point track: [p1, p2, p3,...]
    opt = []

    # get the highest probability at the end of the track
    max_prob = max(value["prob"] for value in V[-1].values())
    previous = None
    if max_prob == 0:
        print " probabilities fall to zero (network distances in data are too large, try increasing network decay parameter)"

    # Get most probable ending state and its backtrack
    for st, data in V[-1].items():
        if data["prob"] == max_prob:
            opt.append(st)
            previous = st
            break
##    print  " previous: "+str(previous)
##    print  " max_prob: "+str(max_prob)
##    print  " V -1: "+str(V[-1].items())

    # Follow the backtrack till the first observation to fish out most probable states and corresponding paths
    for t in range(len(V) - 2, -1, -1):
        #Get the subpath between last and most probable previous segment and add it to the resulting path
        path = V[t + 1][previous]["path"]
        opt[0:0] =(path if path !=None else [])
        #Insert the previous segment
        opt.insert(0, V[t + 1][previous]["prev"])
        previous = V[t + 1][previous]["prev"]
    intertime2 = time.time()
    print("--- Viterbi backtracking: %s seconds ---" % (intertime2 - intertime1))

    #Clean the path (remove double segments and crossings) (only in full path option)
    print "path length before cleaning :" +str(len(opt))
    opt = cleanPath(opt, endpoints)
    intertime3 = time.time()
    print("--- Path cleaning: %s seconds ---" % (intertime3 - intertime2))
    print "final length: "+str(len(opt))
    pointstr= [str(g.firstPoint.X)+' '+str(g.firstPoint.Y) for g in points]
    optstr= [str(i) for i in opt]
    print 'The path for points ['+' '.join(pointstr)+'] is: '
    print '[' + ' '.join(optstr) + '] with highest probability of %s' % max_prob

    #If only a single segment candidate should be returned for each point:
    if addfullpath == False:
        opt = getpointMatches(points,opt)
        optstr= [str(i) for i in opt]
        print "Individual point matches: "+'[' + ' '.join(optstr) + ']'
        intertime4 = time.time()
        print("--- Picking point matches: %s seconds ---" % (intertime4 - intertime3))

    return opt

#Fishes out a 1-to-1 list of path segments nearest to the list of points in the track (not contiguous, may contain repeated segments)
def getpointMatches(points, path):
    qr =  '"OBJECTID" IN ' +str(tuple(path))
    arcpy.SelectLayerByAttribute_management('segments_lyr',"NEW_SELECTION", qr)
    opta = []
    for point in points:
        sdist = 100000
        candidate = ''
        cursor = arcpy.da.SearchCursor('segments_lyr', ["OBJECTID", "SHAPE@"])
        for row in cursor:
            #compute the spatial distance
            dist = point.distanceTo(row[1])
            if dist <sdist:
                sdist=dist
                candidate = row[0]
        opta.append(candidate)
    del cursor
    #print str(candidates)
    return opta



def simplisticMatch(track, segments, maxDist = 50):
    maxDist= float(maxDist)
    #get track points, build network graph (graph, endpoints, lengths) and get segment info from arcpy
    points = getTrackPoints(track, segments)
    opt =[]
    for t in range(1, len(points)):
        point = points[t]
        s = getClosestSegment(point, segments,maxDist)
        opt.append(s)

    return opt



def cleanPath(opt, endpoints):
    # removes redundant segments and segments that are unnecessary to form a path (crossings) in an iterative manner
    last =()
    lastlast =()
    optout = []
    for s in opt:
        if s != last:
            match = False
            if last != () and lastlast != ():
                lastep = endpoints[last]
                lastlastep = endpoints[lastlast]
                sep = endpoints[s]
                for j in lastlastep:
                    if lastep[0]== j:
                        for k in sep:
                            if lastep[1] == k:
                                match = True
                    elif lastep[1]== j:
                        for k in sep:
                            if lastep[0] == k:
                                match = True
            elif last != ():
                sep = endpoints[s]
                lastep = endpoints[last]
                for k in sep:
                    if lastep[1] == k or lastep[0] == k:
                        match = True
            if match:
                optout.append(last)
            if s == opt[-1]:
                #print "add final segment:"+str(s)
                optout.append(s)
        lastlast = last
        last = s
    #print "final length: "+str(len(optout))
    return optout

def getUniqueList(my_list):
    from collections import OrderedDict
    from itertools import izip, repeat

    unique_list = list(OrderedDict(izip(my_list, repeat(None))))
    return unique_list


def exportPath(opt, trackname):
    """
    This exports the list of segments into a shapefile, a subset of the loaded segment file, including all attributes
    """
    start_time = time.time()
    opt=getUniqueList(opt)
    qr =  '"OBJECTID" IN ' +str(tuple(opt))
    outname = (os.path.splitext(os.path.basename(trackname))[0][:9])+'_pth'
    arcpy.SelectLayerByAttribute_management('segments_lyr',"NEW_SELECTION", qr)
    try:
        if arcpy.Exists(outname):
            arcpy.Delete_management(outname)
        arcpy.FeatureClassToFeatureClass_conversion('segments_lyr', arcpy.env.workspace, outname)
        print("--- export: %s seconds ---" % (time.time() - start_time))
    except Exception:
        e = sys.exc_info()[1]
        print(e.args[0])

        # If using this code within a script tool, AddError can be used to return messages
        #   back to a script tool.  If not, AddError will have no effect.
        arcpy.AddError(e.args[0])
        arcpy.AddError(arcpy.env.workspace)
        arcpy.AddError(outname)
        #raise arcpy.ExecuteError
    except arcpy.ExecuteError:
        arcpy.AddError(arcpy.GetMessages(2))

    # Return any other type of error
    except:
        # By default any other errors will be caught here
        #
        e = sys.exc_info()[1]
        print(e.args[0])
        arcpy.AddError(e.args[0])
        arcpy.AddError(arcpy.env.workspace)
        arcpy.AddError(outname)


def getPDProbability(dist, decayconstant = 10):
    """
    The probability that given a certain distance between points and segments, the point is on the segment
    This needs to be parameterized
    Turn difference into a probability with exponential decay function
    """
    decayconstant= float(decayconstant)
    dist= float(dist)
    try:
        p = 1 if dist == 0 else round(1/exp(dist/decayconstant),4)
    except OverflowError:
        p =  round(1/float('inf'),2)
    return p

def getSegmentCandidates(point, segments, decayConstantEu, maxdist=50):
    """
    Returns closest segment candidates with a-priori probabilities.
    Based on maximal spatial distance of segments from point.
    """
    p = point.firstPoint #get the coordinates of the point geometry
    #print "Neighbors of point "+str(p.X) +' '+ str(p.Y)+" : "
    #Select all segments within max distance
    arcpy.SelectLayerByLocation_management ("segments_lyr", "WITHIN_A_DISTANCE", point, maxdist)
    candidates = {}
    #Go through these, compute distances, probabilities and store them as candidates
    cursor = arcpy.da.SearchCursor('segments_lyr', ["OBJECTID", "SHAPE@"])
    row =[]
    for row in cursor:
        feat = row[1]
        #compute the spatial distance
        dist = point.distanceTo(row[1])
        #compute the corresponding probability
        candidates[row[0]] = getPDProbability(dist, decayConstantEu)
    del row
    del cursor
    #print str(candidates)
    return candidates

def getClosestSegment(point, segments, maxdist):
    arcpy.Delete_management('segments_lyr')
    arcpy.MakeFeatureLayer_management(segments, 'segments_lyr')
    arcpy.SelectLayerByLocation_management ("segments_lyr", "WITHIN_A_DISTANCE", point, maxdist)

    #Go through these, compute distances, probabilities and store them as candidates
    cursor = arcpy.da.SearchCursor('segments_lyr', ["OBJECTID", "SHAPE@"])
    sdist = 100000
    candidate = ''
    for row in cursor:
        #compute the spatial distance
        dist = point.distanceTo(row[1])
        if dist <sdist:
            sdist=dist
            candidate = row[0]
    del row
    del cursor
    #print str(candidates)
    return candidate


def getNDProbability(dist,decayconstant = 30):
    """
    The probability that given a certain network distance between segments, one is the successor of the other in a track
    This needs to be parameterized
    Turn difference into a probability  with exponential decay function
    """
    decayconstant = float(decayconstant)
    dist = float(dist)
    try:
        p = 1 if dist == 0 else  round(1/exp(dist/decayconstant),2)
    except OverflowError:
        p =  round(1/float('inf'),2)
    return p

def getNetworkTransP(s1, s2, graph, endpoints, segmentlengths, pathnodes, decayconstantNet):
    """
    Returns transition probability of going from segment s1 to s2, based on network distance of segments, as well as corresponding path
    """
    subpath = []
    s1_point = None
    s2_point = None

    if s1 == s2:
        dist = 0
    else:
        #Obtain edges (tuples of endpoints) for segment identifiers
        s1_edge = endpoints[s1]
        s2_edge = endpoints[s2]

        s1_l = segmentlengths[s1]
        s2_l = segmentlengths[s2]

        #This determines segment endpoints of the two segments that are closest to each other
        minpair = [0,0,100000]
        for i in range(0,2):
            for j in range(0,2):
                d = round(pointdistance(s1_edge[i],s2_edge[j]),2)
                if d<minpair[2]:
                    minpair = [i,j,d]
        s1_point = s1_edge[minpair[0]]
        s2_point = s2_edge[minpair[1]]

##        if (s1_point in pathnodes or s2_point in pathnodes): # Avoid paths reusing an old pathnode (to prevent self-crossings)
##            dist = 100
##        else:
        if s1_point == s2_point:
                #If segments are touching, use a small network distance
                    dist = 5
        else:
                try:
                    #Compute a shortest path (using segment length) on graph where segment endpoints are nodes and segments are (undirected) edges
                    if graph.has_node(s1_point) and graph.has_node(s2_point):
                        dist = nx.shortest_path_length(graph, s1_point, s2_point, weight='length')
                        path = nx.shortest_path(graph, s1_point, s2_point, weight='length')
                        #get path edges
                        path_edges = zip(path,path[1:])
                        #print "edges: "+str(path_edges)
                        subpath = []
                        # get object ids for path edges
                        for e in path_edges:
                            oid = graph[e[0]][e[1]]["OBJECTID"]
                            subpath.append(oid)
                        #print "oid path:"+str(subpath)
                    else:
                        #print "node not in segment graph!"
                        dist = 3*decayconstantNet #600
                except nx.NetworkXNoPath:
                    #print 'no path available, assume a large distance'
                    dist = 3*decayconstantNet #700
    #print "network distance between "+str(s1) + ' and '+ str(s2) + ' = '+str(dist)
    return (getNDProbability(dist,decayconstantNet),subpath,s2_point)

def pointdistance(p1, p2):
    #This Eucl distance can only be used for projected coordinate systems
    dist = sqrt((p1[0]-p2[0])**2 +(p1[1]-p2[1])**2)
    return dist

def getTrackPoints(track, segments):
    """
    Turns track shapefile into a list of point geometries, reprojecting to the planar RS of the network file
    """
    trackpoints = []
    if arcpy.Exists(track):
        for row in arcpy.da.SearchCursor(track, ["SHAPE@"]):
            #make sure track points are reprojected to network reference system (should be planar)
            geom = row[0]
            #geom = row[0].projectAs(arcpy.Describe(segments).spatialReference)
            trackpoints.append(row[0])
        print 'track size:' + str(len(trackpoints))
        return trackpoints
    else:
        print "Track file does not exist!"

def getNetworkGraph(segments,segmentlengths):
    """
    Builds a networkx graph from the network file, inluding segment length taken from arcpy.
    It selects the largest connected component of the network (to prevent errors from routing between unconnected parts)
    """
    #generate the full network path for GDAL to be able to read the file
    path =str(os.path.join(arcpy.env.workspace,segments))
    print path
    if arcpy.Exists(path):
        g = nx.read_shp(path)
        #This selects the largest connected component of the graph
        sg = list(nx.connected_component_subgraphs(g.to_undirected()))[0]
        print "graph size (excluding unconnected parts): "+str(len(g))
        # Get the length for each road segment and append it as an attribute to the edges in the graph.
        for n0, n1 in sg.edges():
            oid = sg[n0][n1]["OBJECTID"]
            sg[n0][n1]['length'] = segmentlengths[oid]
        return sg
    else:
        print "network file not found on path: "+path

def getSegmentInfo(segments):
    """
    Builds a dictionary for looking up endpoints of network segments (needed only because networkx graph identifies edges by nodes)
    """
    if arcpy.Exists(segments):
        cursor = arcpy.da.SearchCursor(segments, ["OBJECTID", "SHAPE@"])
        endpoints = {}
        segmentlengths = {}
        for row in cursor:
              endpoints[row[0]]=((row[1].firstPoint.X,row[1].firstPoint.Y), (row[1].lastPoint.X, row[1].lastPoint.Y))
              segmentlengths[row[0]]= row[1].length
        del row
        del cursor
        print "Number of segments: "+ str(len(endpoints))
        #prepare segment layer for fast search
        arcpy.Delete_management('segments_lyr')
        arcpy.MakeFeatureLayer_management(segments, 'segments_lyr')
        return (endpoints,segmentlengths)
    else:
        print "segment file does not exist!"


if __name__ == '__main__':

##    #Test using the shipped data example
    arcpy.env.workspace = 'C:\\Users\\schei008\\Documents\\Github\\mapmatching'
    opt = mapMatch('testTrack.shp', 'testSegments.shp', 25, 10, 50)
    #outputs testTrack_path.shp
    exportPath(opt, 'testTrack.shp')

##    arcpy.env.workspace = 'C:\\Temp\\Road_293162'
##    opt = mapMatch('Track293162.shp', 'Road_2931_corr.shp', 300, 10, 50)
##    #outputs testTrack_path.shp
##    exportPath(opt, 'Track293162.shp')


##    arcpy.env.workspace = 'C:/Users/simon/Documents/GitHub/mapmatching/'
##    trackname ='QT170212C.shp'
##    roadname ='Roads2.shp'
##    opt = mapMatch(trackname, roadname, 20, 10, 50)
##    exportPath(opt, trackname)
##
##    arcpy.env.workspace = 'C:\\Temp\\Simon.gdb\\test'
##
##    trackname ='Tom170218.shp'
##    roadname ='TrkRoads_Tom1.shp'
##    print trackname
##    print roadname
##    opt = mapMatch(trackname, roadname, 20, 10, 50)
##    exportPath(opt, trackname)
##
##    trackname ='Tom170218_2.shp'
##    roadname ='TrkRoads_Tom2.shp'
##    print trackname
##    print roadname
##    opt = mapMatch(trackname, roadname, 20, 10, 50)
##    exportPath(opt, trackname)
##
####    trackname ='Maarten150318.shp'
####    roadname ='TrkRoads_Maarten.shp'
####    print trackname
####    print roadname
####    opt = mapMatch(trackname, roadname, 20, 10, 50)
####    exportPath(opt, trackname)
##
##    trackname ='Nico160706_ss.shp'
##    roadname ='TrkRoads_Nico.shp'
##    print trackname
##    print roadname
##    opt = mapMatch(trackname, roadname, 60, 40, 50)
##    exportPath(opt, trackname)
##    opt = simplisticMatch(trackname, roadname,50)
##    exportPath(opt, 'Nico_s')