python source code of pefeatures

# -*- coding: utf-8 -*-
''' Extracts some basic features from PE files. Many of the features 
implemented have been used in previously published works. For more information,
check out the following resources:
* Schultz, et al., 2001: http://128.59.14.66/sites/default/files/binaryeval-ieeesp01.pdf
* Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf
* Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf
* Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf
* Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf

It may be useful to do feature selection to reduce this set of features to a meaningful set
for your modeling problem.
'''
import lief  # pip install https://github.com/lief-project/LIEF/releases/download/0.7.0/linux_lief-0.7.0_py3.6.tar.gz
# see https://github.com/lief-project/LIEF/releases

import numpy as np
# FeatureHasher(n_features=10).transform( [ {k:v}, {k:v}])
from sklearn.feature_extraction import FeatureHasher

import re


class FeatureType(object):
    '''Base class from which each feature type may inherit'''

    def __init__(self):
        super().__init__()
        self.dim = 0
        self.dtype = np.float32
        self.name = ''

    def __call__(self, arg):
        raise(NotImplemented)

    def empty(self):
        return np.zeros((self.dim,), dtype=self.dtype)

    def __repr__(self):
        return '{}({})'.format(self.name, self.dim)


class ByteHistogram(FeatureType):
    ''' Byte histogram (normalized to sum to unity) over the entire binary file.'''

    def __init__(self):
        super().__init__()
        self.dim = 1 + 256
        self.name = 'ByteHistogram'

    def __call__(self, bytez):
        h = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
        return np.concatenate([
            [h.sum()],  # total size of the byte stream
            h.astype(self.dtype).flatten() / h.sum(),  # normalized the histogram
        ])


class ByteEntropyHistogram(FeatureType):
    ''' 2d byte/entropy histogram based roughly on (Saxe and Berlin, 2015).
    This roughly approximates the joint probability of byte value and local entropy.
    See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info.
    '''

    def __init__(self, step=1024, window=2048):
        super().__init__()
        self.dim = 256
        self.name = 'ByteEntropyHistogram'
        self.window = window
        self.step = step

    def _entropy_bin_counts(self, block):
        # coarse histogram, 16 bytes per bin
        c = np.bincount(block >> 4, minlength=16) # 16-bin histogram
        p = c.astype(np.float32) / self.window
        wh = np.where(c)[0]
        H = np.sum(-p[wh] * np.log2(p[wh]))*2 # * x2 b.c. we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits)

        Hbin = int(H * 2)  # up to 16 bins (max entropy is 8 bits)
        if Hbin == 16:  # handle entropy = 8.0 bits
            Hbin = 15

        return Hbin, c

    def __call__(self, bytez):
        output = np.zeros((16, 16), dtype=np.int)
        a = np.frombuffer(bytez, dtype=np.uint8)
        if a.shape[0] < self.window:
            Hbin, c = self._entropy_bin_counts(a)
            output[Hbin, :] += c
        else:
            # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
            shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
            strides = a.strides + (a.strides[-1],)
            blocks = np.lib.stride_tricks.as_strided(
                a, shape=shape, strides=strides)[::self.step, :]

            # from the blocks, compute histogram
            for block in blocks:
                Hbin, c = self._entropy_bin_counts(block)
                output[Hbin, :] += c

        return output.flatten().astype(self.dtype)


class SectionInfo(FeatureType):
    '''Information about section names, sizes and entropy.  Uses hashing trick
    to summarize all this section info into a feature vector.
    '''

    def __init__(self):
        super().__init__()
        # sum of the vector sizes comprising this feature
        self.dim = 5 + 50 + 50 + 50 + 50 + 50
        self.name = 'SectionInfo'

    def __call__(self, binary):
        # general statistics about sections
        general = [len(binary.sections),                         # total number of sections
                   # number of sections with nonzero size
                   sum(1 for s in binary.sections if s.size == 0),
                   # number of sections with an empty name
                   sum(1 for s in binary.sections if s.name == ""),
                   sum(1 for s in binary.sections if s.has_characteristic(lief.PE.SECTION_CHARACTERISTICS.MEM_READ)
                       and s.has_characteristic(lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE)),  # number of RX
                   sum(1 for s in binary.sections if s.has_characteristic(
                       lief.PE.SECTION_CHARACTERISTICS.MEM_WRITE)),  # number of W
                   ]

        # gross characteristics of each section
        section_sizes = [(s.name, len(s.content)) for s in binary.sections]
        section_entropy = [(s.name, s.entropy) for s in binary.sections]
        section_vsize = [(s.name, s.virtual_size) for s in binary.sections]

        # properties of entry point, or if invalid, the first executable section
        try:
            entry = binary.section_from_offset(binary.entrypoint)
        except lief.not_found:
            # bad entry point, let's find the first executable section
            entry = None
            for s in binary.sections:
                if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
                    entry = s
                    break
        if entry is not None:
            entry_name = [entry.name]
            entry_characteristics = [str(c)
                                     for c in entry.characteristics_lists]
            # ['SECTION_CHARACTERISTICS.CNT_CODE', 'SECTION_CHARACTERISTICS.MEM_EXECUTE','SECTION_CHARACTERISTICS.MEM_READ']
        else:
            entry_name = []
            entry_characteristics = []

        # let's dump all this info into a single vector
        return np.concatenate([
            np.atleast_2d(np.asarray(general, dtype=self.dtype)),
            FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
                [section_sizes]).toarray(),
            FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
                [section_entropy]).toarray(),
            FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
                [section_vsize]).toarray(),
            FeatureHasher(50, input_type="string", dtype=self.dtype).transform(
                [entry_name]).toarray(),
            FeatureHasher(50, input_type="string", dtype=self.dtype).transform([entry_characteristics]).toarray()
            ], axis=-1).flatten().astype(self.dtype)


class ImportsInfo(FeatureType):
    '''Information about imported libraries and functions from the 
    import address table.  Note that the total number of imported
    functions is contained in GeneralFileInfo.
    '''

    def __init__(self):
        super().__init__()
        self.dim = 256 + 1024
        self.name = 'ImportsInfo'

    def __call__(self, binary):
        libraries = [l.lower() for l in binary.libraries]
        # we'll create a string like "kernel32.dll:CreateFileMappingA" for each entry
        imports = [lib.name.lower() + ':' +
                   e.name for lib in binary.imports for e in lib.entries]

        # two separate elements: libraries (alone) and fully-qualified names of imported functions
        return np.concatenate([
            FeatureHasher(256, input_type="string", dtype=self.dtype).transform(
                [libraries]).toarray(),
            FeatureHasher(1024, input_type="string", dtype=self.dtype).transform(
                [imports]).toarray()
        ], axis=-1).flatten().astype(self.dtype)


class ExportsInfo(FeatureType):
    '''Information about exported functions. Note that the total number of exported
    functions is contained in GeneralFileInfo.
    '''

    def __init__(self):
        super().__init__()
        self.dim = 128
        self.name = 'ExportsInfo'

    def __call__(self, binary):
        return FeatureHasher(128, input_type="string", dtype=self.dtype).transform([binary.exported_functions]).toarray().flatten().astype(self.dtype)


class GeneralFileInfo(FeatureType):
    '''General information about the file.'''

    def __init__(self):
        super().__init__()
        self.dim = 9
        self.name = 'GeneralFileInfo'

    def __call__(self, binary):
        return np.asarray([
            binary.virtual_size,
            binary.has_debug,
            len(binary.exported_functions),
            len(binary.imported_functions),
            binary.has_relocations,
            binary.has_resources,
            binary.has_signature,
            binary.has_tls,
            len(binary.symbols),
        ]).flatten().astype(self.dtype)


class HeaderFileInfo(FeatureType):
    '''Machine, architecure, OS, linker and other information extracted from header.'''

    def __init__(self):
        super().__init__()
        self.dim = 62
        self.name = 'HeaderFileInfo'

    def __call__(self, binary):

        return np.concatenate([
            [[binary.header.time_date_stamps]],
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(binary.header.machine)]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(c) for c in binary.header.characteristics_list]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(binary.optional_header.subsystem)]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(c) for c in binary.optional_header.dll_characteristics_lists]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(binary.optional_header.magic)]]).toarray(),
            [[binary.optional_header.major_image_version]],
            [[binary.optional_header.minor_image_version]],
            [[binary.optional_header.major_linker_version]],
            [[binary.optional_header.minor_linker_version]],
            [[binary.optional_header.major_operating_system_version]],
            [[binary.optional_header.minor_operating_system_version]],
            [[binary.optional_header.major_subsystem_version]],
            [[binary.optional_header.minor_subsystem_version]],
            [[binary.optional_header.sizeof_code]],
            [[binary.optional_header.sizeof_headers]],
            [[binary.optional_header.sizeof_heap_commit]],
        ], axis=-1).flatten().astype(self.dtype)


class StringExtractor(FeatureType):
    ''' Extracts strings from raw byte stream '''

    def __init__(self):
        super().__init__()
        self.dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1
        self.name = 'StringExtractor'
        # all consecutive runs of 0x20 - 0x7f that are 5+ characters
        self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
        # occurances of the string 'C:\'.  Not actually extracting the path
        self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
        # occurances of http:// or https://.  Not actually extracting the URLs
        self._urls = re.compile(b'https?://', re.IGNORECASE)
        # occurances of the string prefix HKEY_.  No actually extracting registry names
        self._registry = re.compile(b'HKEY_')
        # crude evidence of an MZ header (dropper?) somewhere in the byte stream
        self._mz = re.compile(b'MZ')

    def __call__(self, bytez):
        allstrings = self._allstrings.findall(bytez)
        if allstrings:
            # statistics about strings:
            string_lengths = [len(s) for s in allstrings]
            avlength = sum(string_lengths) / len(string_lengths)
            # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive
            as_shifted_string = [b - ord(b'\x20')
                                 for b in b''.join(allstrings)]
            c = np.bincount(as_shifted_string, minlength=96)  # histogram count
            # distribution of characters in printable strings
            p = c.astype(np.float32) / c.sum()
            wh = np.where(c)[0]
            H = np.sum(-p[wh] * np.log2(p[wh]))  # entropy
        else:
            avlength = 0
            p = np.zeros((96,), dtype=np.float32)
            H = 0

        return np.concatenate([
            [[len(allstrings)]],
            [[avlength]],
            [p.tolist()],
            [[H]],
            [[len(self._paths.findall(bytez))]],
            [[len(self._urls.findall(bytez))]],
            [[len(self._registry.findall(bytez))]],
            [[len(self._mz.findall(bytez))]]
        ], axis=-1).flatten().astype(self.dtype)


class PEFeatureExtractor(object):
    ''' Extract useful features from a PE file, and return as a vector
        of fixed size. 
    '''

    def __init__(self):
        # features come in 2 types: those that are extracted from the raw byte stream, and those that require parsing of the PE file
        self.raw_features = [
            ByteHistogram(), 
            ByteEntropyHistogram(), 
            StringExtractor()
        ]

        self.parsed_features = [
            GeneralFileInfo(), 
            HeaderFileInfo(), 
            SectionInfo(), 
            ImportsInfo(), 
            ExportsInfo()
        ]
        self.dim = sum(o.dim for o in self.raw_features) + \
            sum(o.dim for o in self.parsed_features)

    def extract(self, bytez):
        # feature vectors that require only raw bytez
        featurevectors = [fe(bytez) for fe in self.raw_features]

        # feature vectors that require a parsed file
        try:
            binary = lief.PE.parse(list(bytez))
        except (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error,RuntimeError):
            # some kind of parsing problem, none of these feature extractors will work
            binary = None
            featurevectors.extend([fe.empty() for fe in self.parsed_features])
        # except: # everything else (KeyboardInterrupt, SystemExit, ValueError):
        #     raise

        if binary is not None:
            for fe in self.parsed_features:
                try:
                    featurevectors.append(fe(binary))
                except(KeyboardInterrupt, SystemExit):
                    raise
                except:
                    # some property was invalid or missing
                    featurevectors.append(fe.empty())

        return np.concatenate(featurevectors)

    def test(self, bytez):
        for fe in self.raw_features:
            print(fe.name)
            v = fe(bytez)
            assert len(v) == fe.dim, 'length of {} different than expecting!'

        binary = lief.PE.parse(list(bytez))
        for fe in self.parsed_features:
            print(fe.name)
            v = fe(binary)
            assert len(v) == fe.dim, 'length of {} different than expecting!'