python source code of io

"""Serialize graphs to/from files on disk."""

import ast
import os
from xml.etree import ElementTree as etree

import networkx as nx
import numpy as np
import pandas as pd
from shapely import wkt

from . import settings
from . import utils
from . import utils_graph


def save_graph_geopackage(G, filepath=None, encoding="utf-8"):
    """
    Save graph nodes and edges to disk as layers in a GeoPackage file.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        input graph
    filepath : string
        path to the GeoPackage file including extension. if None, use default
        data folder + graph.gpkg
    encoding : string
        the character encoding for the saved file

    Returns
    -------
    None
    """
    # default filepath if none was provided
    if filepath is None:
        filepath = os.path.join(settings.data_folder, "graph.gpkg")

    # if save folder does not already exist, create it
    folder, filename = os.path.split(filepath)
    if not folder == "" and not os.path.exists(folder):
        os.makedirs(folder)

    # convert undirected graph to gdfs and stringify non-numeric columns
    gdf_nodes, gdf_edges = utils_graph.graph_to_gdfs(utils_graph.get_undirected(G))
    gdf_nodes = _stringify_nonnumeric_cols(gdf_nodes)
    gdf_edges = _stringify_nonnumeric_cols(gdf_edges)

    # save the nodes and edges as GeoPackage layers
    gdf_nodes.to_file(filepath, layer="nodes", driver="GPKG", encoding=encoding)
    gdf_edges.to_file(filepath, layer="edges", driver="GPKG", encoding=encoding)
    utils.log(f'Saved graph as GeoPackage at "{filepath}"')


def save_graph_shapefile(G, filepath=None, encoding="utf-8"):
    """
    Save graph nodes and edges to disk as ESRI shapefiles.

    The shapefile format is proprietary and outdated. Whenever possible, you
    should use the superior GeoPackage file format instead, for instance, via
    the save_graph_geopackage function.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        input graph
    filepath : string
        path to the shapefiles folder (no file extension). if None, use
        default data folder + graph_shapefile
    encoding : string
        the character encoding for the saved files

    Returns
    -------
    None
    """
    # default filepath if none was provided
    if filepath is None:
        filepath = os.path.join(settings.data_folder, "graph_shapefile")

    # if save folder does not already exist, create it (shapefiles
    # get saved as set of files)
    if not filepath == "" and not os.path.exists(filepath):
        os.makedirs(filepath)
    filepath_nodes = os.path.join(filepath, "nodes.shp")
    filepath_edges = os.path.join(filepath, "edges.shp")

    # convert undirected graph to gdfs and stringify non-numeric columns
    gdf_nodes, gdf_edges = utils_graph.graph_to_gdfs(utils_graph.get_undirected(G))
    gdf_nodes = _stringify_nonnumeric_cols(gdf_nodes)
    gdf_edges = _stringify_nonnumeric_cols(gdf_edges)

    # save the nodes and edges as separate ESRI shapefiles
    gdf_nodes.to_file(filepath_nodes, encoding=encoding)
    gdf_edges.to_file(filepath_edges, encoding=encoding)
    utils.log(f'Saved graph as shapefiles at "{filepath}"')


def save_graphml(G, filepath=None, gephi=False, encoding="utf-8"):
    """
    Save graph to disk as GraphML file.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        input graph
    filepath : string
        path to the GraphML file including extension. if None, use
        default data folder + graph.graphml
    gephi : bool
        if True, give each edge a unique key to work around Gephi's
        restrictive interpretation of the GraphML specification
    encoding : string
        the character encoding for the saved file

    Returns
    -------
    None
    """
    # default filepath if none was provided
    if filepath is None:
        filepath = os.path.join(settings.data_folder, "graph.graphml")

    # if save folder does not already exist, create it
    folder, filename = os.path.split(filepath)
    if not folder == "" and not os.path.exists(folder):
        os.makedirs(folder)

    # create a copy to convert all the node/edge attribute values to string
    G_save = G.copy()

    if gephi:

        gdf_nodes, gdf_edges = utils_graph.graph_to_gdfs(G_save)

        # turn each edge's key into a unique ID for Gephi compatibility
        gdf_edges["key"] = range(len(gdf_edges))

        # gephi doesn't handle node attrs named x and y well, so rename
        gdf_nodes["xcoord"] = gdf_nodes["x"]
        gdf_nodes["ycoord"] = gdf_nodes["y"]
        G_save = utils_graph.graph_from_gdfs(gdf_nodes, gdf_edges)

        # remove graph attributes as Gephi only accepts node and edge attrs
        G_save.graph = {}

    else:
        # if not gephi, keep graph attrs and stringify them
        for dict_key in G_save.graph:
            # convert all the graph attribute values to strings
            G_save.graph[dict_key] = str(G_save.graph[dict_key])

    # stringify node and edge attributes
    for _, data in G_save.nodes(data=True):
        for dict_key in data:
            if gephi and dict_key in {"xcoord", "ycoord"}:
                # don't convert x y values to string if saving for gephi
                continue
            else:
                # convert all the node attribute values to strings
                data[dict_key] = str(data[dict_key])

    for _, _, data in G_save.edges(keys=False, data=True):
        for dict_key in data:
            # convert all the edge attribute values to strings
            data[dict_key] = str(data[dict_key])

    nx.write_graphml(G_save, path=filepath, encoding=encoding)
    utils.log(f'Saved graph as GraphML file at "{filepath}"')


def load_graphml(filepath, node_type=int):
    """
    Load an OSMnx-saved GraphML file from disk.

    Converts the node/edge attributes to appropriate data types.

    Parameters
    ----------
    filepath : string
        path to the GraphML file
    node_type : type
        convert node ids to this data type

    Returns
    -------
    G : networkx.MultiDiGraph
    """
    # read the graph from disk
    G = nx.MultiDiGraph(nx.read_graphml(filepath, node_type=node_type))

    # convert node/edge attribute data types
    utils.log("Converting node and edge attribute data types")
    G = _convert_node_attr_types(G, node_type)
    G = _convert_edge_attr_types(G, node_type)

    # convert graph crs attribute from saved string to correct dict data type
    # if it is a stringified dict rather than a proj4 string
    if "crs" in G.graph and G.graph["crs"].startswith("{") and G.graph["crs"].endswith("}"):
        G.graph["crs"] = ast.literal_eval(G.graph["crs"])

    if "streets_per_node" in G.graph:
        G.graph["streets_per_node"] = ast.literal_eval(G.graph["streets_per_node"])

    # remove node_default and edge_default metadata keys if they exist
    if "node_default" in G.graph:
        del G.graph["node_default"]
    if "edge_default" in G.graph:
        del G.graph["edge_default"]

    utils.log(f'Loaded graph with {len(G)} nodes and {len(G.edges())} edges from "{filepath}"')
    return G


def _convert_node_attr_types(G, node_type):
    """
    Convert graph nodes' attributes' types from string to numeric.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        input graph
    node_type : type
        convert node ID (osmid) to this type

    Returns
    -------
    G : networkx.MultiDiGraph
    """
    for _, data in G.nodes(data=True):

        # convert node ID to user-requested type
        data["osmid"] = node_type(data["osmid"])

        # convert numeric node attributes from string to float
        for attr in {"elevation", "elevation_res", "lat", "lon", "x", "y"}:
            if attr in data:
                data[attr] = float(data[attr])

    return G


def _convert_edge_attr_types(G, node_type):
    """
    Convert graph edges' attributes' types from string to numeric.

    Parameters
    ----------
    G : networkx.MultiDiGraph
        input graph
    node_type : type
        convert osmid to this type

    Returns
    -------
    G : networkx.MultiDiGraph
    """
    # convert numeric, bool, and list edge attributes from string to correct data types
    for _, _, data in G.edges(data=True, keys=False):

        # parse length to float: should always have only 1 value
        data["length"] = float(data["length"])

        try:
            data["oneway"] = ast.literal_eval(data["oneway"])
            raise ValueError()
        except (KeyError, ValueError):
            # may lack oneway if settings.all_oneway=True when you created
            # graph, or values it can't eval if settings.all_oneway=True
            pass

        # convert to float any possible OSMnx-added edge attributes, which may
        # have multiple values if graph was simplified after they were added
        for attr in {"grade", "grade_abs", "bearing", "speed_kph", "travel_time"}:
            if attr in data:
                if data[attr].startswith("[") and data[attr].endswith("]"):
                    # if it's a list, eval it then convert each item to float
                    data[attr] = [float(a) for a in ast.literal_eval(data[attr])]
                else:
                    data[attr] = float(data[attr])

        # these attributes might have a single value, or a list if edge's
        # topology was simplified
        for attr in {
            "highway",
            "name",
            "bridge",
            "tunnel",
            "lanes",
            "ref",
            "maxspeed",
            "service",
            "access",
            "area",
            "landuse",
            "width",
            "est_width",
        }:
            # if this edge has this attribute, and it starts with '[' and ends
            # with ']', then it's a list to be parsed
            if attr in data and data[attr].startswith("[") and data[attr].endswith("]"):
                # try to convert the string list to a list type, else leave as
                # single-value string (and leave as string if error)
                try:
                    data[attr] = ast.literal_eval(data[attr])
                except Exception:
                    pass

        # osmid might have a single value or a list
        if "osmid" in data:
            if data["osmid"].startswith("[") and data["osmid"].endswith("]"):
                # if it's a list, eval list then convert each element to node_type
                data["osmid"] = [node_type(i) for i in ast.literal_eval(data["osmid"])]
            else:
                # if it's not a list, convert it to the node_type
                data["osmid"] = node_type(data["osmid"])

        # if geometry attribute exists, load the string as well-known text to
        # shapely LineString
        if "geometry" in data:
            data["geometry"] = wkt.loads(data["geometry"])

        # delete the extraneous id attribute added by graphml saving
        if "id" in data:
            del data["id"]

    return G


def _stringify_nonnumeric_cols(gdf):
    """
    Make every non-numeric GeoDataFrame column (besides geometry) a string.

    This allows proper serializing via Fiona of GeoDataFrames with mixed types
    such as strings and ints in the same column.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        gdf to stringify non-numeric columns of

    Returns
    -------
    gdf : geopandas.GeoDataFrame
        gdf with non-numeric columns stringified
    """
    # stringify every non-numeric column other than geometry column
    for col in (c for c in gdf.columns if not c == "geometry"):
        if not pd.api.types.is_numeric_dtype(gdf[col]):
            gdf[col] = gdf[col].fillna("").astype(str)

    return gdf


def save_graph_xml(
    data,
    filepath=None,
    node_tags=settings.osm_xml_node_tags,
    node_attrs=settings.osm_xml_node_attrs,
    edge_tags=settings.osm_xml_way_tags,
    edge_attrs=settings.osm_xml_way_attrs,
    oneway=False,
    merge_edges=True,
    edge_tag_aggs=None,
):
    """
    Save graph to disk as an OSM-formatted XML .osm file.

    Note: for large networks this function can take a long time to run. Before
    using this function, make sure you configured OSMnx as described in the
    example below when you created the graph.

    Example
    -------
    >>> import osmnx as ox
    >>> utn = ox.settings.useful_tags_node
    >>> oxna = ox.settings.osm_xml_node_attrs
    >>> oxnt = ox.settings.osm_xml_node_tags
    >>> utw = ox.settings.useful_tags_way
    >>> oxwa = ox.settings.osm_xml_way_attrs
    >>> oxwt = ox.settings.osm_xml_way_tags
    >>> utn = list(set(utn + oxna + oxnt))
    >>> utw = list(set(utw + oxwa + oxwt))
    >>> ox.config(all_oneway=True, useful_tags_node=utn, useful_tags_way=utw)
    >>> G = ox.graph_from_place('Piedmont, CA, USA', network_type='drive')
    >>> ox.save_graph_xml(G, filepath='./data/graph1.osm')

    Parameters
    ----------
    data : networkx multi(di)graph OR a length 2 iterable of nodes/edges
        geopandas GeoDataFrames
    filepath : string
        path to the .osm file including extension. if None, use default data
        folder + graph.osm
    node_tags : list
        osm node tags to include in output OSM XML
    node_attrs: list
        osm node attributes to include in output OSM XML
    edge_tags : list
        osm way tags to include in output OSM XML
    edge_attrs : list
        osm way attributes to include in output OSM XML
    oneway : bool
        the default oneway value used to fill this tag where missing
    merge_edges : bool
        if True merges graph edges such that each OSM way has one entry
        and one entry only in the OSM XML. Otherwise, every OSM way
        will have a separate entry for each node pair it contains.
    edge_tag_aggs : list of length-2 string tuples
        useful only if merge_edges is True, this argument allows the user
        to specify edge attributes to aggregate such that the merged
        OSM way entry tags accurately represent the sum total of
        their component edge attributes. For example, if the user
        wants the OSM way to have a "length" attribute, the user must
        specify `edge_tag_aggs=[('length', 'sum')]` in order to tell
        this method to aggregate the lengths of the individual
        component edges. Otherwise, the length attribute will simply
        reflect the length of the first edge associated with the way.

    Returns
    -------
    None
    """
    # default filepath if none was provided
    if filepath is None:
        filepath = os.path.join(settings.data_folder, "graph.osm")

    # if save folder does not already exist, create it
    folder, filename = os.path.split(filepath)
    if not folder == "" and not os.path.exists(folder):
        os.makedirs(folder)

    if not settings.all_oneway:
        raise UserWarning(
            "In order for save_graph_osm to behave properly "
            "the graph must have been created with the "
            "`all_oneway` setting set to True."
        )

    try:
        gdf_nodes, gdf_edges = data
    except ValueError:
        gdf_nodes, gdf_edges = utils_graph.graph_to_gdfs(
            data, node_geometry=False, fill_edge_geometry=False
        )

    # rename columns per osm specification
    gdf_nodes.rename(columns={"osmid": "id", "x": "lon", "y": "lat"}, inplace=True)
    if "id" in gdf_edges.columns:
        gdf_edges = gdf_edges[[col for col in gdf_edges if col != "id"]]
    if "uniqueid" in gdf_edges.columns:
        gdf_edges = gdf_edges.rename(columns={"uniqueid": "id"})
    else:
        gdf_edges = gdf_edges.reset_index().rename(columns={"index": "id"})

    # add default values for required attributes
    for table in (gdf_nodes, gdf_edges):
        table["uid"] = "1"
        table["user"] = "osmnx"
        table["version"] = "1"
        table["changeset"] = "1"
        table["timestamp"] = "2017-01-01T00:00:00Z"

    # convert all datatypes to str
    gdf_nodes = gdf_nodes.applymap(str)
    gdf_edges = gdf_edges.applymap(str)

    # misc. string replacements to meet OSM XML spec
    if "oneway" in gdf_edges.columns:

        # fill blank oneway tags with default (False)
        gdf_edges.loc[pd.isnull(gdf_edges["oneway"]), "oneway"] = oneway
        gdf_edges.loc[:, "oneway"] = gdf_edges["oneway"].astype(str)
        gdf_edges.loc[:, "oneway"] = (
            gdf_edges["oneway"].str.replace("False", "no").replace("True", "yes")
        )

    # initialize XML tree with an OSM root element then append nodes/edges
    root = etree.Element("osm", attrib={"version": "1", "generator": "OSMnx"})
    root = _append_nodes_xml_tree(root, gdf_nodes, node_attrs, node_tags)
    root = _append_edges_xml_tree(
        root, gdf_edges, edge_attrs, edge_tags, edge_tag_aggs, merge_edges
    )

    # write to disk
    etree.ElementTree(root).write(filepath)
    utils.log(f'Saved graph as .osm file at "{filepath}"')


def _append_nodes_xml_tree(root, gdf_nodes, node_attrs, node_tags):
    """
    Append nodes to an XML tree.

    Parameters
    ----------
    root : ElementTree.Element
        xml tree
    gdf_nodes : geopandas.GeoDataFrame
        GeoDataFrame of graph nodes
    node_attrs : list
        osm way attributes to include in output OSM XML
    node_tags : list
        osm way tags to include in output OSM XML

    Returns
    -------
    root : ElementTree.Element
        xml tree with nodes appended
    """
    for _, row in gdf_nodes.iterrows():
        node = etree.SubElement(root, "node", attrib=row[node_attrs].dropna().to_dict())
        for tag in node_tags:
            if tag in gdf_nodes.columns:
                etree.SubElement(node, "tag", attrib={"k": tag, "v": row[tag]})
    return root


def _append_edges_xml_tree(root, gdf_edges, edge_attrs, edge_tags, edge_tag_aggs, merge_edges):
    """
    Append edges to an XML tree.

    Parameters
    ----------
    root : ElementTree.Element
        xml tree
    gdf_edges : geopandas.GeoDataFrame
        GeoDataFrame of graph edges
    edge_attrs : list
        osm way attributes to include in output OSM XML
    edge_tags : list
        osm way tags to include in output OSM XML
    edge_tag_aggs : list of length-2 string tuples
        useful only if merge_edges is True, this argument allows the user
        to specify edge attributes to aggregate such that the merged
        OSM way entry tags accurately represent the sum total of
        their component edge attributes. For example, if the user
        wants the OSM way to have a "length" attribute, the user must
        specify `edge_tag_aggs=[('length', 'sum')]` in order to tell
        this method to aggregate the lengths of the individual
        component edges. Otherwise, the length attribute will simply
        reflect the length of the first edge associated with the way.
    merge_edges : bool
        if True merges graph edges such that each OSM way has one entry
        and one entry only in the OSM XML. Otherwise, every OSM way
        will have a separate entry for each node pair it contains.

    Returns
    -------
    root : ElementTree.Element
        xml tree with edges appended
    """
    if merge_edges:

        for e in gdf_edges["id"].unique():
            all_way_edges = gdf_edges[gdf_edges["id"] == e]
            first = all_way_edges.iloc[0]
            edge = etree.SubElement(root, "way", attrib=first[edge_attrs].dropna().to_dict())

            if len(all_way_edges) == 1:
                etree.SubElement(edge, "nd", attrib={"ref": first["u"]})
                etree.SubElement(edge, "nd", attrib={"ref": first["v"]})
            else:
                # topological sort
                ordered_nodes = _get_unique_nodes_ordered_from_way(all_way_edges)
                for node in ordered_nodes:
                    etree.SubElement(edge, "nd", attrib={"ref": node})

            if edge_tag_aggs is None:
                for tag in edge_tags:
                    if tag in all_way_edges.columns:
                        etree.SubElement(edge, "tag", attrib={"k": tag, "v": first[tag]})
            else:
                for tag in edge_tags:
                    if (tag in all_way_edges.columns) and (
                        tag not in (t for t, agg in edge_tag_aggs)
                    ):
                        etree.SubElement(edge, "tag", attrib={"k": tag, "v": first[tag]})

                for tag, agg in edge_tag_aggs:
                    if tag in all_way_edges.columns:
                        etree.SubElement(
                            edge, "tag", attrib={"k": tag, "v": all_way_edges[tag].aggregate(agg)}
                        )
    else:
        # NOTE: this will generate separate OSM ways for each network edge,
        # even if the edges are all part of the same original OSM way. As
        # such, each way will be comprised of two nodes, and there will be
        # many ways with the same OSM id. This does not conform to the
        # OSM XML schema standard, however, the data will still comprise a
        # valid network and will be readable by *most* OSM tools.
        for _, row in gdf_edges.iterrows():
            edge = etree.SubElement(root, "way", attrib=row[edge_attrs].dropna().to_dict())
            etree.SubElement(edge, "nd", attrib={"ref": row["u"]})
            etree.SubElement(edge, "nd", attrib={"ref": row["v"]})
            for tag in edge_tags:
                if tag in gdf_edges.columns:
                    etree.SubElement(edge, "tag", attrib={"k": tag, "v": row[tag]})

    return root


def _get_unique_nodes_ordered_from_way(df_way_edges):
    """
    Recover original node order from df of edges associated w/ single OSM way.

    Parameters
    ----------
    df_way_edges : pandas.DataFrame
        Dataframe containing columns 'u' and 'v' corresponding to
        origin/destination nodes.

    Returns
    -------
    unique_ordered_nodes : list
        An ordered list of unique node IDs.
        Note: If the edges do not all connect (e.g. [(1, 2), (2,3),
        (10, 11), (11, 12), (12, 13)]), then this method will return
        only those nodes associated with the largest component of
        connected edges, even if subsequent connected chunks are contain
        more total nodes. This is done to ensure a proper topological
        representation of nodes in the XML way records because if there
        are unconnected components, the sorting algorithm cannot recover
        their original order. We would not likely ever encounter this
        kind of disconnected structure of nodes within a given way, but
        it is not explicitly forbidden in the OSM XML design schema.
    """
    G = nx.MultiDiGraph()
    all_nodes = list(df_way_edges["u"].values) + list(df_way_edges["v"].values)

    G.add_nodes_from(all_nodes)
    G.add_edges_from(df_way_edges[["u", "v"]].values)

    # copy nodes into new graph
    H = utils_graph.get_largest_component(G, strongly=False)
    unique_ordered_nodes = list(nx.topological_sort(H))
    num_unique_nodes = len(np.unique(all_nodes))

    if len(unique_ordered_nodes) < num_unique_nodes:
        utils.log(f"Recovered order for {len(unique_ordered_nodes)} of {num_unique_nodes} nodes")

    return unique_ordered_nodes