#!/usr/bin/env python
"""Provides utilities for aligning graphs."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random
import numpy as np
import networkx as nx
from eden.graph import vertex_vectorize
from sklearn.neighbors import NearestNeighbors
from itertools import product
from collections import defaultdict
from eden.display import draw_graph
import logging

# NOTE(review): this module uses the networkx 1.x node-attribute API
# (`graph.node[u]`); under networkx >= 2.0 this would need `graph.nodes[u]`.
# Confirm which networkx version the project pins.


def stable(rankings, list_a, list_b):
    """Compute stable alignment.

    rankings[(a, n)] = partner that a ranked n^th

    >>> from itertools import product
    >>> A = ['1','2','3','4','5','6']
    >>> B = ['a','b','c','d','e','f']
    >>> rank = dict()
    >>> rank['1'] = (1,4,2,6,5,3)
    >>> rank['2'] = (3,1,2,4,5,6)
    >>> rank['3'] = (1,2,4,3,5,6)
    >>> rank['4'] = (4,1,2,5,3,6)
    >>> rank['5'] = (1,2,3,6,4,5)
    >>> rank['6'] = (2,1,4,3,5,6)
    >>> rank['a'] = (1,2,3,4,5,6)
    >>> rank['b'] = (2,1,4,3,5,6)
    >>> rank['c'] = (5,1,6,3,2,4)
    >>> rank['d'] = (1,3,2,5,4,6)
    >>> rank['e'] = (4,1,3,6,2,5)
    >>> rank['f'] = (2,1,4,3,6,5)
    >>> Arankings = dict(((a, rank[a][b_]), B[b_]) for (a, b_) in product(A, range(0, 6)))
    >>> Brankings = dict(((b, rank[b][a_]), A[a_]) for (b, a_) in product(B, range(0, 6)))
    >>> rankings = Arankings
    >>> rankings.update(Brankings)
    >>> stable(rankings, A, B)
    [('1', 'a'), ('2', 'b'), ('3', 'd'), ('4', 'f'), ('5', 'c'), ('6', 'e')]
    """
    # Each a in list_a starts by claiming its top-ranked partner.
    # partners[a] = (current b claimed by a, rank of that b in a's list).
    partners = dict((a, (rankings[(a, 1)], 1)) for a in list_a)
    # whether the current pairing (given by `partners`) is stable
    is_stable = False
    while is_stable is False:
        is_stable = True
        for b in list_b:
            # Scan the a's in b's preference order; the first a currently
            # claiming b keeps it, every later claimant is bumped to its
            # next-ranked choice (which restarts the outer loop).
            is_paired = False  # whether b has a pair which b ranks <= to n
            for n in range(1, len(list_b) + 1):
                a = rankings[(b, n)]
                a_partner, a_n = partners[a]
                if a_partner == b:
                    if is_paired:
                        is_stable = False
                        # a loses b; advance a to its next-ranked partner.
                        partners[a] = (rankings[(a, a_n + 1)], a_n + 1)
                    else:
                        is_paired = True
    # Return (a, b) pairs sorted by a.
    stable_list = sorted((a, b) for (a, (b, n)) in partners.items())
    return stable_list


def init_vec(orig_graph):
    """Return a copy of the graph with an empty 'vec' list on every node."""
    graph = orig_graph.copy()
    for u in graph.nodes():
        graph.node[u]['vec'] = []
    return graph


def annotate_with_bfs(orig_graph, start, max_depth=20):
    """Append one BFS-distance feature to every node's 'vec' attribute.

    Nodes reachable from `start` within `max_depth` get the value
    (max_depth - distance); all other nodes keep the default value 1.
    Returns an annotated copy; the input graph is not modified.
    """
    graph = orig_graph.copy()
    # Default feature value for nodes the BFS never reaches.
    for u in graph.nodes():
        graph.node[u]['vec'].append(1)
    visited, queue, dist = set(), [start], dict()
    dist[start] = 0
    # Plain list-based BFS (queue.pop(0) is O(n) per pop; fine for small graphs).
    while queue:
        vertex = queue.pop(0)
        if dist[vertex] <= max_depth:
            if vertex not in visited:
                visited.add(vertex)
                # Closer to the start => larger feature value.
                val = max_depth - dist[vertex]
                graph.node[vertex]['vec'][-1] = val
                next_nodes = [u for u in graph.neighbors(vertex)
                              if u not in visited]
                next_dist = dist[vertex] + 1
                for u in next_nodes:
                    dist[u] = next_dist
                queue.extend(next_nodes)
    return graph


def make_same_size(GA_orig, GB_orig):
    """Pad the smaller graph with isolated dummy nodes (label 'x').

    Returns copies of both graphs with equal node counts.
    NOTE(review): dummy node ids are the integers len(smaller)..len(larger)-1;
    this assumes node ids are 0-based consecutive integers — confirm callers
    always relabel graphs that way.
    """
    GA = GA_orig.copy()
    GB = GB_orig.copy()
    na = len(GA)
    nb = len(GB)
    if na > nb:
        for i in range(nb, na):
            GB.add_node(i, label='x')
    if nb > na:
        for i in range(na, nb):
            GA.add_node(i, label='x')
    return GA, GB


def trim_pairings(pairings, GA_orig, GB_orig):
    """Convert ('A<i>', 'B<j>') label pairs to 0-based (i, j) index pairs.

    Pairs that involve a padding node (index beyond the original graph's
    size) are dropped.
    """
    npairings = []
    for a, b in pairings:
        # Strip the leading 'A'/'B' and shift from 1-based to 0-based.
        i, j = int(a[1:]) - 1, int(b[1:]) - 1
        if i < len(GA_orig) and j < len(GB_orig):
            npairings.append((i, j))
    return npairings


def match(GA_orig, GB_orig, order=3, max_depth=10, complexity=4):
    """Match nodes of graph A to nodes of graph B.

    Pipeline: pad graphs to equal size; vectorize nodes with the EDEN
    vertex kernel; seed `order` BFS annotations from the best-matching
    node pairs; re-vectorize with the real-valued annotations; build
    mutual nearest-neighbor preference rankings and solve a stable
    matching over them.

    Returns a tuple `orderB` such that node i of the (smaller) reference
    graph is paired with node orderB[i] of the other graph.
    """
    # The smaller graph is always used as reference graph A.
    if len(GA_orig) > len(GB_orig):
        GA, GB = GB_orig.copy(), GA_orig.copy()
        logging.warning('Warning: reference graph is B not A')
    else:
        GA, GB = GA_orig.copy(), GB_orig.copy()
    # logging.warning('Matching graph A (%d nodes) to graph B (%d nodes)' % (len(GA_orig), len(GB_orig)))
    GA, GB = make_same_size(GA, GB)
    # First pass: discrete, normalized node vectors.
    M = vertex_vectorize([GA, GB], complexity=complexity,
                         normalization=True, inner_normalization=True)
    MA, MB = M[0], M[1]
    # Full preference lists in both directions (k = all nodes).
    nnA = NearestNeighbors(n_neighbors=len(GA)).fit(MA)
    d, BprefA = nnA.kneighbors(MB)
    nnB = NearestNeighbors(n_neighbors=len(GB)).fit(MB)
    d, AprefB = nnB.kneighbors(MA)
    # mark bfv in vec attribute
    GA, GB = init_vec(GA), init_vec(GB)
    for k in range(order):
        # k-th most confident A node (smallest nearest-neighbor distance)
        # and its best B counterpart seed one BFS annotation round each.
        # NOTE(review): order > len(GA) would raise IndexError here.
        ds = d[:, 0]
        id_max_A = np.argsort(ds)[k]
        id_max_B = AprefB[id_max_A][0]
        GA = annotate_with_bfs(GA, id_max_A, max_depth=max_depth)
        GB = annotate_with_bfs(GB, id_max_B, max_depth=max_depth)
    # draw_graph_set([GA,GB],n_graphs_per_line=2, size=9, secondary_vertex_label='vec')
    # vectorize 2nd time with real values this time
    M = vertex_vectorize([GA, GB], complexity=complexity, discrete=False,
                         normalization=False, inner_normalization=False)
    MA, MB = M[0], M[1]
    nnA = NearestNeighbors(n_neighbors=len(GA)).fit(MA)
    d, BprefA = nnA.kneighbors(MB)
    nnB = NearestNeighbors(n_neighbors=len(GB)).fit(MB)
    d, AprefB = nnB.kneighbors(MA)
    # Build 1-based string ids and the rankings dict expected by stable():
    # rankings[(x, n)] = the partner that x ranks n-th.
    A = ['A%d' % (i + 1) for i in range(len(GA))]
    B = ['B%d' % (i + 1) for i in range(len(GB))]
    Arankings = dict(((A[i], j + 1), B[AprefB[i, j]])
                     for i, j in product(range(len(GA)), range(len(GA))))
    Brankings = dict(((B[i], j + 1), A[BprefA[i, j]])
                     for i, j in product(range(len(GB)), range(len(GB))))
    rankings = Arankings
    rankings.update(Brankings)
    pairings = stable(rankings, A, B)
    # remove dummy node pairings
    npairings = trim_pairings(pairings, GA_orig, GB_orig)
    orderA, orderB = list(zip(*sorted(npairings)))
    return orderB


def _max_common_subgraph(GA, GB, pairings):
    """Collect GB nodes whose incident GA edges are preserved by `pairings`.

    An edge (i, j) of GA counts as preserved when the paired nodes
    (ii, jj) are adjacent in GB and both node labels agree.
    Returns (connected components, the induced GB subgraph).
    """
    matches = dict([(i, j) for i, j in enumerate(pairings)])
    node_ids = []
    for i, j in GA.edges():
        ii = matches[i]
        jj = matches[j]
        li = GA.node[i]['label']
        lii = GB.node[ii]['label']
        lj = GA.node[j]['label']
        ljj = GB.node[jj]['label']
        # Edge preserved in either orientation, with matching labels.
        if ((ii, jj) in GB.edges() or (jj, ii) in GB.edges()) and li == lii and lj == ljj:
            node_ids.append(ii)
            node_ids.append(jj)
    G = nx.subgraph(GB, node_ids)
    cc = nx.connected_components(G)
    return cc, G


def max_common_subgraph(GA, GB, pairings):
    """Return the largest connected common subgraph, or None if empty."""
    cc, G = _max_common_subgraph(GA, GB, pairings)
    len_ccs = [(len(ci), ci) for ci in cc]
    if not len_ccs:
        return None
    # NOTE(review): on a size tie, max() falls back to comparing the
    # component node collections themselves — verify this is acceptable.
    n_cc, max_cc_ids = max(len_ccs)
    max_cc = nx.subgraph(G, max_cc_ids)
    return max_cc


def max_common_subgraphs(GA, GB, pairings):
    """Return all connected common subgraphs as a list of graphs."""
    cc, G = _max_common_subgraph(GA, GB, pairings)
    cc_graphs = [nx.subgraph(G, ci) for ci in cc]
    return cc_graphs


def compute_matching_edges_fraction(GA, GB, pairings):
    """Fraction of GA edges mapped onto GB edges by `pairings`.

    NOTE(review): raises ZeroDivisionError if GA has no edges — confirm
    callers never pass edgeless graphs.
    """
    count = 0
    matches = dict([(i, j) for i, j in enumerate(pairings)])
    for i, j in GA.edges():
        ii = matches[i]
        jj = matches[j]
        if (ii, jj) in GB.edges():
            count += 1
    return float(count) / len(GA.edges())


def compute_matching_neighborhoods_fraction(GA, GB, pairings):
    """Fraction of GA nodes whose entire neighborhood is edge-preserved.

    A node counts when every one of its GA neighbors is connected to it
    through an edge that `pairings` maps onto a GB edge.
    """
    count = 0
    matches = dict([(i, j) for i, j in enumerate(pairings)])
    # matching_edges[u] = neighbors of u reached via a preserved edge.
    matching_edges = defaultdict(list)
    for i, j in GA.edges():
        ii = matches[i]
        jj = matches[j]
        if (ii, jj) in GB.edges():
            matching_edges[i].append(j)
            matching_edges[j].append(i)
    for u in GA.nodes():
        if matching_edges.get(u, False):
            neighbors = nx.neighbors(GA, u)
            matches_neighborhood = True
            for v in neighbors:
                if v not in matching_edges[u]:
                    matches_neighborhood = False
                    break
            if matches_neighborhood:
                count += 1
    return float(count) / len(GA.nodes())


def compute_max_common_subgraph_size(GA, GB, pairings):
    """Size of the largest common subgraph, normalized by |GB| (0 if none)."""
    G = max_common_subgraph(GA, GB, pairings)
    if G is None:
        return 0
    # True division via `from __future__ import division`.
    return len(G) / len(GB)


def compute_max_common_subgraphs_size(GA, GB, pairings):
    """Total size of all common subgraphs, normalized by |GB| (0 if none)."""
    gs = max_common_subgraphs(GA, GB, pairings)
    if not gs:
        return 0
    return sum(len(g) if g else 0 for g in gs) / float(len(GB))


def compute_quality(GA, GB, pairings):
    """Quality of a matching: normalized total common-subgraph size."""
    return compute_max_common_subgraphs_size(GA, GB, pairings)


def random_optimize(GA, GB, n_iter=20):
    """Random search over (complexity, order, max_depth) for match().

    Returns (best_quality, best_complexity, best_order, best_depth).
    """
    best_c = None
    best_order = None
    best_depth = None
    best_quality = -1
    for it in range(n_iter):
        # Sample hyperparameters uniformly from fixed ranges.
        c = random.randint(1, 7)
        order = random.randint(1, 10)
        depth = random.randint(1, 20)
        pairings = match(GA, GB, complexity=c, order=order, max_depth=depth)
        quality = compute_quality(GA, GB, pairings)
        if quality > best_quality:
            best_quality = quality
            best_c = c
            best_order = order
            best_depth = depth
        logging.debug('[random search] quality:%.2f c:%d o:%d d:%d' %
                      (best_quality, best_c, best_order, best_depth))
    return best_quality, best_c, best_order, best_depth


def line_optimize(GA, GB, n_iter, best_c, best_order, best_depth):
    """Coordinate-descent refinement of the hyperparameters of match().

    Starting from the given (best_c, best_order, best_depth), each of the
    `n_iter` rounds sweeps one parameter at a time over its full range
    while holding the other two at their current best values.
    Returns (best_quality, best_c, best_order, best_depth).
    """
    best_quality = -1
    # Ranges are swept from the largest value downwards; together with the
    # `>=` comparisons below, ties are resolved toward smaller values.
    c_range = range(8, 0, -1)
    order_range = range(10, 0, -1)
    depth_range = range(20, 0, -1)
    for it in range(n_iter):
        # Sweep complexity.
        c = best_c
        order = best_order
        depth = best_depth
        for c in c_range:
            pairings = match(GA, GB, complexity=c, order=order,
                             max_depth=depth)
            quality = compute_quality(GA, GB, pairings)
            if quality >= best_quality:
                best_quality = quality
                best_c = c
                best_order = order
                best_depth = depth
        # Sweep order.
        c = best_c
        order = best_order
        depth = best_depth
        for order in order_range:
            pairings = match(GA, GB, complexity=c, order=order,
                             max_depth=depth)
            quality = compute_quality(GA, GB, pairings)
            if quality >= best_quality:
                best_quality = quality
                best_c = c
                best_order = order
                best_depth = depth
        # Sweep depth.
        c = best_c
        order = best_order
        depth = best_depth
        for depth in depth_range:
            pairings = match(GA, GB, complexity=c, order=order,
                             max_depth=depth)
            quality = compute_quality(GA, GB, pairings)
            if quality >= best_quality:
                best_quality = quality
                best_c = c
                best_order = order
                best_depth = depth
        logging.debug('[line search] quality:%.2f c:%d o:%d d:%d' %
                      (best_quality, best_c, best_order, best_depth))
    return best_quality, best_c, best_order, best_depth


def optimize(GA, GB, n_iter_random=20, n_iter_line=4):
    """Random search followed by line search over match() hyperparameters.

    Returns (best_quality, best_complexity, best_order, best_depth).
    """
    best_quality, best_c, best_order, best_depth = random_optimize(
        GA, GB, n_iter=n_iter_random)
    best_quality, best_c, best_order, best_depth = line_optimize(
        GA, GB, n_iter_line, best_c, best_order, best_depth)
    return best_quality, best_c, best_order, best_depth


def draw_match(GA, GB, pairings, size=10):
    """Draw GA and GB side by side with dotted links between paired nodes.

    Pair links are only drawn when the two nodes carry the same label;
    they are tagged with nesting=True so the layout ignores them.
    """
    # disjoint_union relabels GB's nodes to len(GA)..len(GA)+len(GB)-1.
    G = nx.disjoint_union(GA, GB)
    for i, jj in enumerate(pairings):
        j = len(GA) + jj
        if G.node[i]['label'] == G.node[j]['label']:
            G.add_edge(i, j, label=i, nesting=True)
    draw_graph(
        G,
        size=size,
        vertex_border=False,
        vertex_size=400,
        edge_label=None,
        edge_alpha=.2,
        dark_edge_color='label',
        dark_edge_dotted=False,
        dark_edge_alpha=1,
        colormap='Set2',
        ignore_for_layout='nesting')


# -----------------------------------------------------------------------------


class GraphAligner(object):
    """Aligns graphs.

    Thin object-oriented wrapper around match()/optimize(): stores the
    two graphs (smaller one as reference), optionally tunes the matching
    hyperparameters, and caches the resulting pairings and quality.
    """

    def __init__(self, complexity=3, order=1, max_depth=5):
        """construct."""
        self.set_params(complexity, order, max_depth)
        self.pairings = None
        self.quality = None
        self.max_common_graph = None

    def set_params(self, complexity=3, order=1, max_depth=5):
        """Set the hyperparameters forwarded to match()."""
        self.complexity = complexity
        self.order = order
        self.max_depth = max_depth

    def _store(self, graph_a, graph_b):
        """Store copies of both graphs, the smaller one as reference."""
        GA = graph_a.copy()
        GB = graph_b.copy()
        if len(GA) > len(GB):
            GA, GB = GB, GA
        self.ref_graph = GA
        self.second_ref_graph = GB

    def _optimize(self, n_iter_random=30, n_iter_line=3):
        """Tune hyperparameters on the stored graphs and adopt the best."""
        GA, GB = self.ref_graph, self.second_ref_graph
        q, c, o, d = optimize(
            GA, GB,
            n_iter_random=n_iter_random,
            n_iter_line=n_iter_line)
        self.quality = q
        self.complexity = c
        self.order = o
        self.max_depth = d

    def match(self, graph_a, graph_b, n_iter_random=30, n_iter_line=3):
        """Optimize hyperparameters, then match the two graphs.

        Returns self (fluent interface).
        """
        self._store(graph_a, graph_b)
        self._optimize(n_iter_random, n_iter_line)
        self._match(graph_a, graph_b)
        return self

    def _match(self, graph_a, graph_b):
        """Match with the current hyperparameters; cache pairings/quality."""
        self._store(graph_a, graph_b)
        GA, GB = self.ref_graph, self.second_ref_graph
        self.pairings = match(
            GA, GB,
            complexity=self.complexity,
            order=self.order,
            max_depth=self.max_depth)
        self.quality = compute_quality(GA, GB, self.pairings)
        return self

    def get_matching_quality(self):
        """Return the quality of the last matching (None before match())."""
        return self.quality

    def draw_match(self):
        """Draw the stored graphs with their pairing; returns self."""
        GA, GB = self.ref_graph, self.second_ref_graph
        draw_match(GA, GB, self.pairings)
        return self

    def max_common_subgraph(self):
        """Compute, cache and return the largest common subgraph."""
        GA, GB = self.ref_graph, self.second_ref_graph
        self.max_common_graph = max_common_subgraph(GA, GB, self.pairings)
        return self.max_common_graph

    def max_common_subgraphs(self):
        """Compute, cache and return all common subgraphs."""
        GA, GB = self.ref_graph, self.second_ref_graph
        self.common_graphs = max_common_subgraphs(GA, GB, self.pairings)
        return self.common_graphs