python source code of metadata

import time
import bisect
import logging
import weakref
import itertools
import threading
import collections

from lighthouse.util.qt import QtCore
from lighthouse.util.log import lmsg
from lighthouse.util.misc import *
from lighthouse.util.python import *
from lighthouse.util.disassembler import disassembler

from lighthouse.util.debug import catch_errors

logger = logging.getLogger("Lighthouse.Metadata")

#------------------------------------------------------------------------------
# Metadata
#------------------------------------------------------------------------------
#
#    To aid in performance, Lighthouse lifts and indexes an in-memory limited
#    representation of the disassembler's open database. This is commonly
#    referred to as 'metadata' throughout this codebase.
#
#    Once built, the lifted metadata cache stands completely independent of
#    the disassembler. This effectively eliminates the need for Lighthouse to
#    communicate with the underlying disassembler / API (which is slow) when
#    mapping coverage, or doing coverage composition logic.
#
#    With this model, we have been able to move the heavy director based
#    coverage composition logic to python-only threads without disrupting the
#    user, or IDA. (added in v0.4.0)
#
#    However, there are two main caveats of this model -
#
#    1. The cached 'metadata' representation may not always be true to state
#       of the database. For example, if the user defines/undefines functions,
#       the metadata cache will not be aware of such changes.
#
#       Lighthouse will try to update the director's metadata cache when
#       applicable, but there are instances when it will be in the best
#       interest of the user to manually trigger a refresh of the metadata.
#
#    2. Building the metadata comes with an upfront cost, but this cost has
#       been reduced as much as possible. For example, generating metadata for
#       a larger database with ~25k functions, ~725k nodes (basic blocks), and
#       ~3.4m instructions took ~27 seconds.
#
#       This will be negligible for small-medium sized databases, but will be
#       measurable for larger databases.
#
#    Ultimately, this model provides us a more responsive user experience at
#    the expense of the occasional inaccuracies that can be corrected by
#    reasonably low cost refresh.
#

#------------------------------------------------------------------------------
# Database Metadata
#------------------------------------------------------------------------------

class DatabaseMetadata(object):
    """
    Database level metadata cache.
    """

    def __init__(self, lctx=None):
        self.lctx = lctx

        # name & imagebase of the executable this metadata is based on
        self.filename = ""
        self.imagebase = BADADDR

        # database metadata cache status
        self.cached = False

        # the cache of key database structures
        self.nodes = {}
        self.functions = {}
        self.instructions = []

        # internal members to help index & navigate the cached metadata
        self._name2func = {}
        self._node2func = collections.defaultdict(list)
        self._node_addresses = []
        self._function_addresses = []

        # HACK: dirty hack since we can't create a blank node easily
        self._last_node = lambda: None
        self._last_node.instructions = []

        # create the disassembler hooks to listen for rename events
        if lctx:
            self._rename_hooks = disassembler[lctx].create_rename_hooks()
            self._rename_hooks.name_changed = self._name_changed
        else:
            self._rename_hooks = None

        # asynchronous metadata collection thread
        self._refresh_worker = None
        self._stop_threads = False
        self._go_synchronous = False

        # a scheduled callback to watch for specific database changes
        self._scheduled_interval = 2000 # ms
        self._scheduled_timer = QtCore.QTimer()
        self._scheduled_timer.setInterval(self._scheduled_interval)
        self._scheduled_timer.setSingleShot(True)
        self._scheduled_timer.timeout.connect(self._scheduled_worker)

        #----------------------------------------------------------------------
        # Callbacks
        #----------------------------------------------------------------------

        self._metadata_modified_callbacks = []
        self._function_renamed_callbacks = []
        self._rebased_callbacks = []

    #--------------------------------------------------------------------------
    # Subsystem Lifetime
    #--------------------------------------------------------------------------

    def start(self):
        """
        Start the metadata subsystem.
        """
        if self._scheduled_timer:
            self._scheduled_timer.start()

    def terminate(self):
        """
        Cleanup & terminate the metadata object.
        """
        self.abort_refresh(join=True)
        if self._rename_hooks:
            self._rename_hooks.unhook()

        # attempt to stop the scheduled callback... semi-safely :S
        if self._scheduled_timer:
            stopping = self._scheduled_timer
            self._scheduled_timer = None
            stopping.stop()

        # best effort to free up resources & improve interpreter spindown
        del self._metadata_modified_callbacks
        del self._function_renamed_callbacks
        del self._rebased_callbacks
        self._clear_cache()

    #--------------------------------------------------------------------------
    # Providers
    #--------------------------------------------------------------------------

    def get_instructions_slice(self, start_address, end_address):
        """
        Get the instructions addresses that fall within a given range.
        """
        index_start = bisect.bisect_left(self.instructions, start_address)
        index_end   = bisect.bisect_left(self.instructions, end_address)
        return self.instructions[index_start:index_end]

    def get_instruction_size(self, address):
        """
        Get the size of an instruction at a given address.

        Returns:
          -1 if undefined address (not within a basic block)
           0 if within defined instruction
           n if it is a defined instruction
        """
        node_metadata = self.get_node(address)

        #
        # if the given address does not fall within a node, we have no idea how
        # big it really is. return -1
        #

        if not node_metadata:
            return -1

        #
        # if the address falls within a node, attempt to return the size of the
        # instruction at its address. if the address is misaligned / in the
        # middle of an instruction, simply return 0
        #

        return node_metadata.instructions.get(address, 0)

    def get_node(self, address):
        """
        Get the node (basic block) metadata for a given address.
        """

        # fast path, effectively a LRU cache of 1 ;P
        if address in self._last_node.instructions:
            return self._last_node

        #
        # use the lookup lists to do a 'fuzzy' lookup of the given address,
        # locating the index of the closest known node address (rounding down)
        #

        index = bisect.bisect_right(self._node_addresses, address) - 1
        node_metadata = self.nodes.get(self._node_addresses[index], None)

        #
        # this should hit 99.9% of the time on the first index...
        #
        # but we added a fallback in the rare case when binja creates an edge
        # to an unknown/undefined instruction, whose address happens to fall
        # within a real one, thus throwing off the basic block lookup...
        #
        # technically, we could also fail going back only one block, but at
        # that point, idc, the user is looking at some weird binaries... :\
        #

        if not (node_metadata and address in node_metadata.instructions):
            node_metadata = self.nodes.get(self._node_addresses[index-1], None)

            # double fault, let's just dip...
            if not (node_metadata and address in node_metadata.instructions):
                return None

        #
        # if the selected node metadata contains the given target address, it
        # is a positive hit and we should cache this node (in last_node) for
        # faster consecutive lookups
        #

        self._last_node = node_metadata

        # return the located node_metadata
        return node_metadata

    def get_function(self, function_address):
        """
        Get the function metadata that starts at the given address.
        """
        return self.functions.get(function_address, None)

    def get_functions_containing(self, address):
        """
        Get the list of function metadata objects that contain the given address.
        """
        node_metadata = self.get_node(address)
        if not node_metadata:
            return []
        return self.get_functions_by_node(node_metadata.address)

    def get_function_by_name(self, function_name):
        """
        Get the function metadata for a given function name.
        """
        try:
            return self.functions[self._name2func[function_name]]
        except (IndexError, KeyError):
            return None

    def get_function_by_index(self, index):
        """
        Get the function metadata for a given function index.
        """
        try:
            return self.functions[self._function_addresses[index]]
        except (IndexError, KeyError):
            return None

    def get_function_index(self, address):
        """
        Get the function index for a given address.
        """
        return self._function_addresses.index(address)

    def get_functions_by_node(self, node_address):
        """
        Get the functions containing the given node.
        """
        return self._node2func.get(node_address, [])

    def get_closest_function(self, address):
        """
        Get the function metadata for the function closest to the give address.
        """

        # sanity check
        if not self._function_addresses:
            return None

        # get the closest insertion point of the given address
        index = bisect.bisect_left(self._function_addresses, address)

        # the given address is a min, return the first known function
        if index == 0:
            return self.functions[self._function_addresses[0]]

        # given address is a max, return the last known function
        if index == len(self._function_addresses):
            return self.functions[self._function_addresses[-1]]

        # select the two candidate addresses
        before = self._function_addresses[index - 1]
        after  = self._function_addresses[index]

        # return the function closest to the given address
        if after - address < address - before:
            return self.functions[after]
        else:
            return self.functions[before]

    def is_big(self):
        """
        Return a bool indicating whether we think the database is 'big'.
        """
        return len(self.functions) > 50000

    #--------------------------------------------------------------------------
    # Refresh
    #--------------------------------------------------------------------------

    def refresh(self, progress_callback=None):
        """
        Refresh the database metadata cache.
        """
        self._refresh(progress_callback)

    def refresh_async(self, progress_callback=None, force=False):
        """
        Refresh the database metadata cache asynchronously.

        Returns a future (Queue) that will carry the completion message.
        """
        assert self._refresh_worker == None, 'Refresh already running'
        result_queue = queue.Queue()

        #
        # if there is already metadata cached for this disassembler session,
        # ignore a request to refresh it unless forced
        #

        if self.cached and not force:
            result_queue.put(False)
            return result_queue

        #
        # reset the async abort and go_synchronous flags so that we can use them
        # for this new refresh if needed
        #

        self._stop_threads = False
        self._go_synchronous = False

        #
        # kick off an asynchronous metadata collection task
        #

        self._refresh_worker = threading.Thread(
            target=self._refresh_async,
            args=(result_queue, progress_callback,)
        )
        self._refresh_worker.start()

        #
        # immediately return a queue to the caller which it can use to listen
        # on and wait for a refresh completion message
        #

        return result_queue

    def abort_refresh(self, join=False):
        """
        Abort an asynchronous refresh.

        To guarantee an asynchronous refresh has been canceled, the caller can
        optionally wait for the result_queue from refresh() to return 'None'.

        Alternatively, the `join` parameter can be set to `True`, making this
        function block until the refresh is canceled.
        """

        #
        # the refresh worker (if it exists) can be ripped away at any time.
        # take a local reference to avoid a double fetch problems
        #

        worker = self._refresh_worker

        #
        # if there is no worker present or running (cleaning up?) there is
        # nothing for us to abort. Simply reset the abort flag (just in case)
        # and return immediately
        #

        if not (worker and worker.is_alive()):
            self._stop_threads = False
            self._refresh_worker = None
            return

        # signal the worker thread to stop
        self._stop_threads = True

        # if requested, don't return until the worker thread has stopped...
        if join:
            worker.join()

    def _refresh_instructions(self):
        """
        Refresh the list of database instructions (from function metadata).
        """
        instructions = []
        for function_metadata in itervalues(self.functions):
            instructions.append(function_metadata.instructions)
        instructions = list(set(itertools.chain.from_iterable(instructions)))
        instructions.sort()

        # commit the updated instruction list
        self.instructions = instructions

    def _refresh_lookup(self):
        """
        Refresh the internal fast lookup address lists.

        Fast lookup lists are simply sorted address lists of function metadata,
        node metadata, or possibly other forms of metadata (in the future). We
        create sorted lists of metadata object addresses so that we can use them
        for fast, fuzzy address lookup (eg, bisect).

         c.f:
          - get_node(ea)
          - get_function(ea)

        """
        self._last_node = lambda: None # XXX blank node hack, see other ref to _last_node
        self._last_node.instructions = []
        self._name2func = { f.name: f.address for f in itervalues(self.functions) }
        self._node_addresses = sorted(self.nodes.keys())
        self._function_addresses = sorted(self.functions.keys())
        for function_metadata in itervalues(self.functions):
            for node_address in function_metadata.nodes:
                self._node2func[node_address].append(function_metadata)

    def go_synchronous(self):
        """
        Switch an ongoing async refresh into a synchronous one.

        This will make it go ... significantly faster ... but cannot be interrupted.
        """
        self._go_synchronous = True

    #--------------------------------------------------------------------------
    # Metadata Collection
    #--------------------------------------------------------------------------

    @not_mainthread
    def _refresh_async(self, result_queue, progress_callback=None):
        """
        Internal thread worker routine to refresh the database metadata asynchronously.
        """

        # start an interruptable refresh
        completed = self._refresh(progress_callback, True)

        # clean up our thread's reference as it is basically done/dead
        self._refresh_worker = None

        # send the refresh result (good/bad) incase anyone is still listening
        result_queue.put(completed)

        # exit thread...

    def _clear_cache(self):
        """
        Cleare the metadata cache of all collected info.
        """
        self.nodes = {}
        self.functions = {}
        self.instructions = []
        self._node2func = collections.defaultdict(list)
        self._refresh_lookup()
        self.cached = False

    def _refresh(self, progress_callback=None, is_async=False):
        """
        Internal routine that will update the database metadata cache.
        """
        self._clear_cache()

        # pause our rename listening hooks (more performant collection)
        if self._rename_hooks:
            self._rename_hooks.unhook()

        # grab the cached imagebase as it might have changed
        prev_imagebase = self.imagebase

        # refresh high level database properties that we wish to cache
        self._sync_refresh_properties()

        #
        # we will perform a complete metadata refresh of all database defined
        # functions. let's retrieve that list from the disassembler now...
        #

        disassembler_ctx = disassembler[self.lctx]
        function_addresses = disassembler.execute_read(disassembler_ctx.get_function_addresses)()
        total = len(function_addresses)

        start = time.time()
        #----------------------------------------------------------------------

        # refresh the core database metadata asynchronously
        if is_async and self._async_collect_metadata(function_addresses, progress_callback):
            self._clear_cache()
            return False

        # refresh the core database metadata synchronously
        completed = total - len(function_addresses)
        self._sync_collect_metadata(function_addresses, progress_callback, completed)

        #----------------------------------------------------------------------
        end = time.time()
        logger.debug("Metadata collection took %s seconds" % (end - start))

        # regenerate the instruction list from collected metadata
        self._refresh_instructions()

        # refresh the internal function/node fast lookup lists
        self._refresh_lookup()

        #----------------------------------------------------------------------

        # reinstall the rename listener hooks now that the refresh is done
        self._rename_hooks.hook()

        # the metadata refresh is effectively done, and the data is now 'cached'
        self.cached = True

        # detect & notify of a rebase event
        if prev_imagebase != BADADDR and prev_imagebase != self.imagebase:
            self._notify_rebased(prev_imagebase, self.imagebase)

        # return true/false to indicates completion
        return True

    @disassembler.execute_read
    def _sync_refresh_properties(self):
        """
        Refresh a selection of interesting database properties.
        """
        disassembler_ctx = disassembler[self.lctx]
        self.filename = disassembler_ctx.get_root_filename()
        self.imagebase = disassembler_ctx.get_imagebase()

    @disassembler.execute_read
    def _sync_collect_metadata(self, function_addresses, progress_callback, progress_base=0):
        """
        Collect metadata from the underlying database.
        """
        CHUNK_SIZE = 500
        completed = progress_base
        total = progress_base + len(function_addresses)
        logger.debug("Refreshing synchronously from %u/%u" % (completed, total))

        while function_addresses:

            # split off a chunk of functions to process metadata for
            addresses_chunk = function_addresses[:CHUNK_SIZE]
            del function_addresses[:CHUNK_SIZE]

            # collect metadata from the database
            self._cache_functions(addresses_chunk)

            # report incremental progress to an optional progress_callback
            if progress_callback:
                completed += CHUNK_SIZE if function_addresses else len(addresses_chunk)
                progress_callback(completed, total)

    @not_mainthread
    def _async_collect_metadata(self, function_addresses, progress_callback):
        """
        Collect metadata from the underlying database asynchronously (interruptable).
        """
        CHUNK_SIZE = 150
        completed = 0
        total = len(function_addresses)
        logger.debug("Refreshing asynchronously from %u/%u" % (completed, total))

        while function_addresses:

            #
            # here we will split off CHUNK_SIZE elements from the function
            # addresses list, in-place. this allows the list to keep track of
            # what has not been processed, such that the caller can continue
            # to operate on it if needed
            #

            addresses_chunk = function_addresses[:CHUNK_SIZE]
            del function_addresses[:CHUNK_SIZE]

            # collect metadata from the database
            self._async_cache_functions(addresses_chunk)

            # report incremental progress to an optional progress_callback
            if progress_callback:
                completed += CHUNK_SIZE if function_addresses else len(addresses_chunk)
                progress_callback(completed, total)

            # if the refresh was canceled, stop collecting metadata and bail
            if self._stop_threads:
                logger.debug("Async metadata collection is bailing!")
                return True

            # ALL SYSTEMS GO!!
            if self._go_synchronous:
                break

            # sleep some so we don't choke the mainthread
            time.sleep(.015)

        # the refresh either completed, or it is going synchronous!
        return False

    @disassembler.execute_read
    def _async_cache_functions(self, addresses_chunk):
        """
        Wrapped version of self._cache_functions, safe for use from an async worker thread.
        """
        self._cache_functions(addresses_chunk)

    @catch_errors
    def _cache_functions(self, addresses_chunk):
        """
        Lift and cache function metadata for the given list of function addresses.
        """
        disassembler_ctx = disassembler[self.lctx]

        for address in addresses_chunk:

            # attempt to 'lift' the function from the database
            try:
                function_metadata = FunctionMetadata(address, disassembler_ctx)

            #
            # this is not exactly a good thing but it indicates that the
            # disassembler didn't see the a function that we thought should
            # have been there based on what it told us previously...
            #
            # this means the database might have changed, while the refresh
            # was running. it's not the end of the world, but it might mean
            # the cache will not be fully accurate...
            #

            except Exception:
                lmsg(" - Caching function at 0x%08X failed..." % address)
                logger.exception("FunctionMetadata Error:")
                continue

            # add the updated info
            self.nodes.update(function_metadata.nodes)
            self.functions[address] = function_metadata

    #--------------------------------------------------------------------------
    # Signal Handlers
    #--------------------------------------------------------------------------

    def _name_changed(self, address, new_name):
        """
        Handle function rename event.
        """
        function = self.get_function(address)
        if not (function and function.address == address):
            return

        # if the name isn't actually changing (misfire?) nothing to do
        if new_name == function.name:
            return

        logger.debug("Name changing @ 0x%X" % address)
        logger.debug("  Old name: %s" % function.name.encode("utf-8"))
        logger.debug("  New name: %s" % new_name.encode("utf-8"))

        # update the function name in the cached lookup & rename it for real
        self._name2func[new_name] = self._name2func.pop(function.name)
        function.name = new_name

        # notify metadata listeners of the rename event
        self._notify_function_renamed()

    #--------------------------------------------------------------------------
    # Callbacks
    #--------------------------------------------------------------------------

    def metadata_modified(self, callback):
        """
        Subscribe a callback for metadata modification events.
        """
        register_callback(self._metadata_modified_callbacks, callback)

    def _notify_metadata_modified(self):
        """
        Notify listeners of a metadata modification event.
        """
        notify_callback(self._metadata_modified_callbacks)

    def function_renamed(self, callback):
        """
        Subscribe a callback for function rename events.
        """
        register_callback(self._function_renamed_callbacks, callback)

    def _notify_function_renamed(self):
        """
        Notify listeners of a function rename event.
        """
        notify_callback(self._function_renamed_callbacks)

    def rebased(self, callback):
        """
        Subscribe a callback for director rebasing events.
        """
        register_callback(self._rebased_callbacks, callback)

    def _notify_rebased(self, old_imagebase, new_imagebase):
        """
        Notify listeners of a database rebasing event.
        """
        notify_callback(self._rebased_callbacks)

    #--------------------------------------------------------------------------
    # Scheduled
    #--------------------------------------------------------------------------

    @disassembler.execute_read
    def _scheduled_worker(self):
        """
        A timed callback to watch for metadata-relevant database changes.
        """
        logger.debug("In timed metadata callback...")
        disassembler_ctx = disassembler[self.lctx]

        # watch for rebase events
        current_imagebase = disassembler_ctx.get_imagebase()
        if (self.cached and current_imagebase != self.imagebase):

            # only attempt a rebase if the disassembler seems idle...
            if not disassembler_ctx.busy:
                lmsg("Rebasing Lighthouse (0x%X --> 0x%X)" % (self.imagebase, current_imagebase))
                self.lctx.director.refresh()

        # schedule the next update (ms)
        if self._scheduled_timer:
            self._scheduled_timer.start(self._scheduled_interval)

#------------------------------------------------------------------------------
# Function Metadata
#------------------------------------------------------------------------------

class FunctionMetadata(object):
    """
    Function level metadata cache.
    """

    def __init__(self, address, disassembler_ctx=None):

        # function metadata
        self.address = address
        self.name = None

        # node metadata
        self.nodes = {}
        self.edges = collections.defaultdict(list)

        # fixed/baked/computed metrics
        self.size = 0
        self.node_count = 0
        self.edge_count = 0
        self.instruction_count = 0
        self.cyclomatic_complexity = 0

        # collect metdata from the underlying database
        self._cache_function(disassembler_ctx)

    #--------------------------------------------------------------------------
    # Properties
    #--------------------------------------------------------------------------

    @property
    def instructions(self):
        """
        Return the instruction addresses in this function.
        """
        return set(itertools.chain.from_iterable([node.instructions for node in itervalues(self.nodes)]))

    @property
    def empty(self):
        """
        Return a bool indicating whether the object is populated.
        """
        return self.size == 0

    #--------------------------------------------------------------------------
    # Metadata Population
    #--------------------------------------------------------------------------

    def _cache_function(self, disassembler_ctx):
        """
        Collect function metadata from the underlying database.
        """
        self.name = disassembler_ctx.get_function_name_at(self.address)
        self._refresh_nodes(disassembler_ctx)
        self._finalize()

    def _refresh_nodes(self, disassembler_ctx):
        """
        This will be replaced with a disassembler-specific function at runtime.

        NOTE: Read the 'MONKEY PATCHING' section at the end of this file.
        """
        raise RuntimeError("This function should have been monkey patched...")

    def _ida_refresh_nodes(self, _):
        """
        Refresh function node metadata against an open IDA database.
        """
        function_metadata = self
        function_metadata.nodes = {}

        # get function & flowchart object from IDA database
        function  = idaapi.get_func(self.address)
        flowchart = idaapi.qflow_chart_t("", function, idaapi.BADADDR, idaapi.BADADDR, 0)

        #
        # now we will walk the flowchart for this function, collecting
        # information on each of its nodes (basic blocks) and populating
        # the function & node metadata objects.
        #

        for node_id in xrange(flowchart.size()):
            node = flowchart[node_id]

            #
            # the node current node appears to have a size of zero. This means
            # that another flowchart / function owns this node so we can just
            # ignore it...
            #

            if node.start_ea == node.end_ea:
                continue

            # create a new metadata object for this node
            node_metadata = NodeMetadata(node.start_ea, node.end_ea, node_id)

            #
            # establish a relationship between this node (basic block) and
            # this function metadata (its parent)
            #

            function_metadata.nodes[node.start_ea] = node_metadata

        # compute all of the edges between nodes in the current function
        for node_metadata in itervalues(function_metadata.nodes):
            edge_src = node_metadata.edge_out
            for edge_dst in idautils.CodeRefsFrom(edge_src, True):
                if edge_dst in function_metadata.nodes:
                    function_metadata.edges[edge_src].append(edge_dst)

    def _binja_refresh_nodes(self, disassembler_ctx):
        """
        Refresh function node metadata against an open Binary Ninja database.
        """
        function_metadata = self
        function_metadata.nodes = {}
        bv = disassembler_ctx.bv
        count = ctypes.c_ulonglong(0)

        # get the function from the Binja database
        function = bv.get_function_at(self.address)

        #
        # now we will walk the flowchart for this function, collecting
        # information on each of its nodes (basic blocks) and populating
        # the function & node metadata objects.
        #

        for node in function.basic_blocks:

            # create a new metadata object for this node
            node_metadata = NodeMetadata(node.start, node.end, node.index, disassembler_ctx)

            #
            # establish a relationship between this node (basic block) and
            # this function metadata (its parent)
            #

            function_metadata.nodes[node.start] = node_metadata

            #
            # enumerate the edges produced by this node (basic block) with a
            # destination that falls within this function.
            #

            edge_src = node_metadata.edge_out

            count.value = 0
            edges = core.BNGetBasicBlockOutgoingEdges(node.handle, count)

            for i in range(0, count.value):
                if edges[i].target:
                    function_metadata.edges[edge_src].append(node._create_instance(BNNewBasicBlockReference(edges[i].target), bv).start)
            core.BNFreeBasicBlockEdgeList(edges, count.value)

            # NOTE/PERF ~28% of metadata collection time alone...
            #for edge in node.outgoing_edges:
            #    function_metadata.edges[edge_src].append(edge.target.start)

    def _compute_complexity(self):
        """
        Walk the function CFG to determine approximate cyclomatic complexity.

        The purpose of this function is mostly to account for IDA's inclusion
        of additional floating nodes in function flowcharts. These blocks tend
        to be for exception handlers, but can manifest in various other cases.

        By walking the function CFG, we can identify these 'disembodied'
        blocks that have no incoming edge and ignore them in our cyclomatic
        complexity calculation. Not doing so will radically throw off the
        cyclomatic complexity score.
        """
        confirmed_nodes = set()
        confirmed_edges = {}

        #
        # to_walk contains a list of node addresses. we draw from this list
        # one at a time, walking across all of the outgoing edges from the
        # current node (node_address) to walk the function graph
        #

        to_walk = set([self.address]) if self.nodes else set()
        while to_walk:

            # this is the address of the node we will 'walk' from
            node_address = to_walk.pop()
            confirmed_nodes.add(node_address)

            # now we loop through all edges that originate from this block
            current_src = self.nodes[node_address].edge_out
            for current_dest in self.edges[current_src]:

                # ignore nodes we have already visited
                if current_dest in confirmed_nodes:
                    continue

                #
                # it appears that this node has not been visited yet, so we
                # will want to walk its edges sometime soon to continue the
                # graph exploration
                #

                to_walk.add(current_dest)

            # update the map of confirmed (walked) edges
            confirmed_edges[current_src] = self.edges.pop(current_src)

        # compute the final cyclomatic complexity for the function
        num_edges = sum(len(x) for x in itervalues(confirmed_edges))
        num_nodes = len(confirmed_nodes)
        return num_edges - num_nodes + 2

    def _finalize(self):
        """
        Finalize function metadata for use.
        """
        self.size = sum(node.size for node in itervalues(self.nodes))
        self.node_count = len(self.nodes)
        self.edge_count = len(self.edges)
        self.instruction_count = sum(node.instruction_count for node in itervalues(self.nodes))
        self.cyclomatic_complexity = self._compute_complexity()

    #--------------------------------------------------------------------------
    # Operator Overloads
    #--------------------------------------------------------------------------

    def __eq__(self, other):
        """
        Compute function metadata equality (==)
        """
        result = True
        result &= self.name == other.name
        result &= self.size == other.size
        result &= self.address == other.address
        result &= self.node_count == other.node_count
        result &= self.instruction_count == other.instruction_count
        result &= viewkeys(self.nodes) == viewkeys(other.nodes)
        return result

#------------------------------------------------------------------------------
# Node Metadata
#------------------------------------------------------------------------------

class NodeMetadata(object):
    """
    Node (basic block) level metadata cache.
    """

    def __init__(self, start_ea, end_ea, node_id=None, disassembler_ctx=None):

        # node metadata
        self.size = end_ea - start_ea
        self.address = start_ea
        self.instruction_count = 0
        self.edge_out = -1

        # flowchart node_id
        self.id = node_id

        # instruction addresses
        self.instructions = {}

        #----------------------------------------------------------------------

        # collect metadata from the underlying database
        self._cache_node(disassembler_ctx)

    #--------------------------------------------------------------------------
    # Metadata Population
    #--------------------------------------------------------------------------

    def _cache_node(self, disassembler_ctx):
        """
        This will be replaced with a disassembler-specific function at runtime.

        NOTE: Read the 'MONKEY PATCHING' section at the end of this file.
        """
        raise RuntimeError("This function should have been monkey patched...")

    def _ida_cache_node(self, _):
        """
        Collect node metadata from the underlying database.
        """
        current_address = self.address
        node_end = self.address + self.size

        #
        # loop through the node's entire address range and count its
        # instructions. Note that we are assuming that every defined
        # 'head' (in IDA) is an instruction
        #

        while current_address < node_end:
            instruction_size = get_item_end(current_address) - current_address
            self.instructions[current_address] = instruction_size
            current_address += instruction_size

        # the source of the outward edge
        self.edge_out = current_address - instruction_size

        # save the number of instructions in this block
        self.instruction_count = len(self.instructions)

    def _binja_cache_node(self, disassembler_ctx):
        """
        Collect node metadata from the underlying database.
        """
        current_address = self.address
        node_end = self.address + self.size

        # NOTE/PERF: gotta go fast :D
        bh = disassembler_ctx.bv.handle
        ah = disassembler_ctx.bv.arch.handle

        #
        # Note that we 'iterate over' the instructions using their byte length
        # because it is far more performant than Binary Ninja's instruction
        # generators which also produce instruction text, tokens etc...
        #

        while current_address < node_end:
            instruction_size = BNGetInstructionLength(bh, ah, current_address) or 1
            self.instructions[current_address] = instruction_size
            current_address += instruction_size

        # the source of the outward edge
        self.edge_out = current_address - instruction_size

        # save the number of instructions in this block
        self.instruction_count = len(self.instructions)

    #--------------------------------------------------------------------------
    # Operator Overloads
    #--------------------------------------------------------------------------

    def __str__(self):
        """
        Printable NodeMetadata.
        """
        output  = ""
        output += "Node 0x%08X Info:\n" % self.address
        output += " Address: 0x%08X\n" % self.address
        output += " Size: %u\n" % self.size
        output += " Instruction Count: %u\n" % self.instruction_count
        output += " Id: %u\n" % self.id
        output += " Instructions: %s" % self.instructions
        return output

    def __contains__(self, address):
        """
        Overload python's 'in' keyword for this object.

        This allows us to use `in` to check if an address falls within a node.
        """
        if self.address <= address < self.address + self.size:
            return True
        return False

    def __eq__(self, other):
        """
        Compute node equality (==)
        """
        result = True
        result &= self.size == other.size
        result &= self.address == other.address
        result &= self.instruction_count == other.instruction_count
        result &= self.id == other.id
        return result

#------------------------------------------------------------------------------
# Async Metadata Helpers
#------------------------------------------------------------------------------

@disassembler.execute_ui
def metadata_progress(completed, total):
    """
    Handler for metadata collection callback, updates progress dialog.
    """
    disassembler.replace_wait_box(
        "Collected metadata for %u/%u Functions" % (completed, total)
    )

#------------------------------------------------------------------------------
# MONKEY PATCHING
#------------------------------------------------------------------------------
#
#   We use 'monkey patching' to modify the Metadata class definitions at
#   runtime. Specifically, we use it to swap in metadata collection routines
#   that have been carefully tailored for a given disassembler.
#
#   The reason for this is that the metadata collection code is very
#   disassembler-specific, and that it needs to be as performant as possible.
#   Shimming metadata collection code to be disassembler agnostic is going
#   to be messy and slow.
#

if disassembler.NAME == "IDA":
    import idaapi
    import idautils
    FunctionMetadata._refresh_nodes = FunctionMetadata._ida_refresh_nodes
    NodeMetadata._cache_node = NodeMetadata._ida_cache_node

    # pull hot funcs out of module for faster access... (perf)
    from idaapi import get_item_end

elif disassembler.NAME == "BINJA":
    import ctypes
    import binaryninja
    from binaryninja import core
    FunctionMetadata._refresh_nodes = FunctionMetadata._binja_refresh_nodes
    NodeMetadata._cache_node = NodeMetadata._binja_cache_node

    # pull hot funcs out of module for faster access... (perf)
    BNGetInstructionLength = core.BNGetInstructionLength
    BNNewBasicBlockReference = core.BNNewBasicBlockReference

else:
    raise NotImplementedError("DISASSEMBLER-SPECIFIC SHIM MISSING")