python source code of firmware

import magic
import pickle
import argparse
import IPython
from multiprocessing import Pool
from tqdm import tqdm
from firmware_slap.function_clustering import *
from firmware_slap.function_handler import get_arg_funcs, get_function_information
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_distances
import os
import hashlib


def get_executable_files(directory, progress=True):

    executables = []
    shared_libs = []
    hashes = []

    print("[+] Reading Files")
    if not os.path.isdir(directory):
        print("[~] {} is not directory".format(directory))
        file_type = magic.from_file(directory, mime=True)

        if "application/x-sharedlib" in file_type:
            shared_libs.append(directory)
        elif "symbolic link to" in file_type and ".so" in file_type:
            shared_libs.append(directory)
        elif "application/x-executable" in file_type:
            executables.append(directory)
    total_len = len([x for x in os.walk(directory)])
    for root, subdirs, files in tqdm(os.walk(directory), total=total_len):
        for filename in files:
            full_path = os.path.join(root, filename)

            if os.path.islink(full_path):
                continue

            if os.path.isfile(full_path):
                file_type = magic.from_file(full_path, mime=True)

            file_hash = md5sum(full_path)

            if file_hash not in hashes:
                hashes.append(file_hash)

                if "application/x-sharedlib" in file_type:
                    shared_libs.append(full_path)
                elif "symbolic link to" in file_type and ".so" in file_type:
                    shared_libs.append(full_path)
                elif "application/x-executable" in file_type:
                    executables.append(full_path)

    return executables, shared_libs


def md5sum(filename):
    h = hashlib.md5()
    b = bytearray(128 * 1024)
    mv = memoryview(b)
    with open(filename, 'rb', buffering=0) as f:
        for n in iter(lambda: f.readinto(mv), 0):
            h.update(mv[:n])
    return h.hexdigest()


def get_firmware_sparse(file_list):

    all_functions = []
    m_pool = Pool()

    print("[+] Getting Function Information")
    for i in tqdm(m_pool.imap_unordered(get_sparse_file_data, file_list),
                  total=len(file_list)):
        all_functions.extend(i)
    '''
    for file_item in file_list:

        info = get_function_information(file_item)
        #arg_info = get_arg_funcs(file_item)
        #trimmed = trim_funcs(arg_info)
        trimmed = trim_funcs(info)
        all_functions.extend(trimmed)
    '''

    return all_functions


def get_sparse_file_data(file_name):

    info = get_function_information(file_name)
    #arg_info = get_arg_funcs(file_item)
    #trimmed = trim_funcs(arg_info)
    if info:
        trimmed = trim_funcs(info, file_name=file_name)
    else:
        trimmed = []
    return trimmed


def plot_clustering(all_functions, plot_it=False):
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    #svd = TruncatedSVD(random_state=2)
    #svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

    #func_sparse = svd.fit_transform(func_sparse)

    scores = []
    clust_count = []
    labels = []
    for x in range(2, 50):
        result = KMeans(n_clusters=x, random_state=2).fit(func_sparse)

        score = silhouette_score(func_sparse,
                                 result.labels_,
                                 metric="cosine",
                                 random_state=2,
                                 sample_size=5000)
        scores.append(score)
        clust_count.append(x)
        labels.append(result.labels_)

        print("Clusters {:<3} | Silhoette Score : {}".format(x, score))

    largest_dif = 200
    large_index = 0
    for x in range(1, len(scores) - 2):
        if largest_dif > scores[x]:
            largest_dif = scores[x]
            large_index = x
        '''
        if largest_dif < abs(scores[x] - scores[x+1]):
                largest_dif = abs(scores[x] - scores[x+1])
                large_index = x+1
        '''
    print("Largest drop at {} with {}".format(clust_count[large_index],
                                              largest_dif))

    if plot_it:
        plt.plot(clust_count, scores)
        plt.xlabel("Cluster Centroid Count")
        plt.ylabel("Silhoette Score")
        plt.grid = True
        plt.show()

    return labels[large_index]


def single_cluster(all_functions, centroid_count=2):
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    # svd = TruncatedSVD(random_state=2)
    # svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

    # func_sparse = svd.fit_transform(func_sparse)

    labels = []

    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)

    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)
    labels.append(result.labels_)

    print("Clusters {:<3} | Silhoette Score : {}".format(
        centroid_count, score))

    return result.labels_


def get_cosine_dist(all_functions):
    return_dict = {}
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    return cosine_distances(func_sparse, func_sparse)


def get_single_cluster(all_functions, centroid_count=2):
    return_dict = {}
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    # svd = TruncatedSVD(random_state=2)
    # svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

    # func_sparse = svd.fit_transform(func_sparse)

    labels = []

    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)

    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)
    labels.append(result.labels_)

    #print("Clusters {:<3} | Silhoette Score : {}".format(centroid_count, score))
    return_dict['count'] = centroid_count
    return_dict['score'] = score
    return_dict['labels'] = result.labels_

    return return_dict


def remove_non_needed_functions(all_functions, remove_features=True):

    bad_prefix = ["fcn.", "sub.", "loc.", "aav.", "sym._fini", "sym._init"]

    keep_functions = []
    for function in all_functions:
        if not any(prefix in function['name'] for prefix in bad_prefix):
            keep_functions.append(function)

    if remove_features:
        bad_features = [
            'bits', 'calltype', 'maxbound', 'minbound', 'offset', 'size'
        ]
        for function in keep_functions:
            for feat in bad_features:
                try:
                    function.pop(feat)
                except:
                    pass

    return keep_functions


def test():
    parser = argparse.ArgumentParser()

    parser.add_argument("Directory")
    parser.add_argument("--Dump", "-D", help="Dump to pickle")
    parser.add_argument("--Load", "-L", help="Load from pickle")

    args = parser.parse_args()

    if args.Load:
        with open(args.Load, 'rb') as f:
            all_functions = pickle.load(f)
    else:
        executables, shared_libs = get_executable_files(args.Directory)

        all_files = executables + shared_libs

        all_functions = get_firmware_sparse(all_files)

        if args.Dump:
            with open(args.Dump, 'wb') as f:
                pickle.dump(all_functions, f, -1)

    #Fix symbols 'n stuff
    bad_prefix = ["fcn.", "sub.", "loc.", "aav.", "sym._fini", "sym._init"]
    all_functions = [
        x for x in all_functions
        if not any([y in x['name'] for y in bad_prefix])
    ]

    bad_features = [
        'bits', 'calltype', 'maxbound', 'minbound', 'offset', 'size'
    ]
    for func in all_functions:
        for feat in bad_features:
            func.pop(feat)

    func_labels = plot_clustering(all_functions, True)

    file_names = [x['file_name'] for x in all_functions]
    func_names = [x['name'] for x in all_functions]
    for x, y, z in zip(file_names, func_names, func_labels):
        x = x.split('/')[-1]
        print("{} | {} | {}".format(x, y, z))

    IPython.embed()


if __name__ == "__main__":
    test()