# Python source code of droidutil

import os
import errno
import re
import shutil
import magic
import xml.dom.minidom
import hashlib
from collections import defaultdict

"""Those are my own utilities for sample analysis"""

def mkdir_if_necessary(path):
    """Creates the directory (and any missing parent) if it does not exist yet.

    If it exists, does not do anything.
    If path is None (not filled), does not do anything.

    Raises OSError when the directory cannot be created, including when
    path already exists but is not a directory.
    """
    if path is None:
        return

    try:
        os.makedirs(path)
    except OSError as exc:
        # An already existing directory is the expected "nothing to do" case;
        # every other failure is a real error for the caller.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise

def on_rm_tree_error(fn, path, exc_info):
    """
    Error handler for ``shutil.rmtree``.

    rmtree fails in particular if the file to delete is read-only.
    To remove it, we grant all permissions on ``path`` and then retry once.

    fn       -- the function that failed, as reported by rmtree
    path     -- the path that was given to fn
    exc_info -- exception information supplied by rmtree (not used)

    Only failures of os.rmdir, os.remove and os.unlink are retried. Failures
    of any other function are ignored (best effort). An error raised by the
    retry itself propagates to the caller.

    Usage : ``shutil.rmtree(path, onerror=on_rm_tree_error)``
    """
    if fn is os.rmdir:
        retry = os.rmdir
    elif fn in (os.remove, os.unlink):
        # rmtree may report either name for a failed file deletion
        retry = os.remove
    else:
        return

    # The mode has to be octal: decimal 777 is 0o1411, which does not give
    # the owner write permission.
    os.chmod(path, 0o777)
    retry(path)

def move_dir(src,dst):
    """Move the content of the src directory into dst, then remove src.

    Works even if dst already exists: in that case the entries of src are
    merged into it. dst is created when it does not exist yet.

    - hidden entries (names starting with a dot) are moved as well
    - a file already present in dst is replaced by the one coming from src
    - a sub-directory present on both sides is merged recursively
    - paths are never handed to a shell, so spaces and shell
      metacharacters in names are harmless

    Raises AssertionError if src is not an existing directory, shutil.Error
    if an entry would have to replace a directory with a non-directory (or
    the reverse), and OSError for any other filesystem failure. src is only
    removed once every entry has been moved.
    """
    assert os.path.isdir(src), "src must be an existing directory"

    if os.path.isdir(dst) and os.path.samefile(src, dst):
        # Moving a directory onto itself must not end up deleting it
        return
    if not os.path.isdir(dst):
        os.makedirs(dst)

    for entry in os.listdir(src):
        source = os.path.join(src, entry)
        target = os.path.join(dst, entry)
        source_is_dir = os.path.isdir(source) and not os.path.islink(source)
        target_is_dir = os.path.isdir(target) and not os.path.islink(target)

        if os.path.lexists(target):
            if source_is_dir and target_is_dir:
                # same sub-directory on both sides: merge their contents
                move_dir(source, target)
                continue
            if source_is_dir or target_is_dir:
                raise shutil.Error("Cannot replace '%s' with '%s': one is a directory, the other is not" % (target, source))
            # plain file or link: the entry coming from src wins
            os.remove(target)

        shutil.move(source, target)

    shutil.rmtree(src, onerror=on_rm_tree_error)

def sanitize_filename(filename):
    """Sanitizes a filename so that we can create the output analysis directory without any problem.
    We need to consider we might have filenames with Russian or Chinese characters.

    filename is only the 'basename' not an absolute path

    Returns the sanitized name. Every character which is not an ASCII letter,
    a digit, '_' or '.' is dropped, so the result may be shorter than the
    input and even empty."""
    # Raw string: a dot needs no escaping inside a character class, and the
    # former '\.' was an invalid escape sequence in a normal string literal.
    return re.sub(r'[^a-zA-Z0-9_.]', '', filename)

def listAll(dirName):
    """Recursively lists all the regular files located below dirName.

    Returns a list of paths, each one built by joining dirName with the
    relative location of the file. Directories themselves are not listed.

    Entries which are neither a regular file nor a directory (for instance
    a dangling symbolic link) are skipped.
    """
    found = []
    for name in os.listdir(dirName):
        path = os.path.join(dirName, name)
        if os.path.isfile(path):
            found.append(path)
        elif os.path.isdir(path):
            found.extend(listAll(path))
        # anything else cannot be listed and is not a file: ignore it
    return found



def count_filedirs(dirname):
    """Recursively counts the directories and the files found below dirname.

    dirname must be readable (checked with an assertion).

    Returns a tuple:
    nb of directories
    nb of files

    Roughly what `find <dirname> -type d` and `find <dirname> -type f`
    would list, not counting dirname itself.
    """
    assert os.access(dirname, os.R_OK), "Can't access directory: "+dirname

    entries = [os.path.join(dirname, name) for name in os.listdir(dirname)]
    subdirs = [path for path in entries if os.path.isdir(path)]

    nb_dirs = len(subdirs)
    nb_files = sum(1 for path in entries if os.path.isfile(path))

    for subdir in subdirs:
        try:
            below_dirs, below_files = count_filedirs(subdir)
        except RuntimeError:
            # nesting too deep for the recursion: count nothing for this branch
            below_dirs, below_files = 0, 0
        nb_dirs += below_dirs
        nb_files += below_files

    return nb_dirs, nb_files

def sha256sum(input_file_name):
    """Hashes the content of a file with SHA256, reading it in binary mode.

    Returns the hexadecimal digest, or '' (after printing a message) when
    the file cannot be read."""
    block_size = 1024 * 1024 # 1 MB
    digest = hashlib.sha256()
    try:
        with open(input_file_name, "rb") as stream:
            # read() hands back an empty bytes object once the end is reached
            for block in iter(lambda: stream.read(block_size), b""):
                digest.update(block)
    except IOError:
        print ('sha256sum: cannot open file: %s' % (input_file_name))
        return ''
    return digest.hexdigest()

def sha1sum(input_file_name):
    """Hashes the content of a file with SHA1, reading it in binary mode.

    Returns the hexadecimal digest, or '' (after printing a message) when
    the file cannot be read."""
    block_size = 1024 * 1024 # 1 MB
    digest = hashlib.sha1()
    try:
        with open(input_file_name, "rb") as stream:
            # read() hands back an empty bytes object once the end is reached
            for block in iter(lambda: stream.read(block_size), b""):
                digest.update(block)
    except IOError:
        print ('sha1sum: cannot open file: %s' % (input_file_name))
        return ''
    return digest.hexdigest()

# -------------------------- File Constants -------------------------
# Anything other than the file types below. This file type is not supported.
UNKNOWN = 0

# An APK. A ZIP cannot be told apart from an APK until we have looked inside the ZIP.
APK = 1

# A Dalvik Executable file. No check that the file is valid/accepted by the verifier.
DEX = 2

# An ARM ELF executable.
ARM = 3

# A Java .class file.
CLASS = 4

# A Zip file. This can also be a JAR or an APK until it has been thoroughly checked.
ZIP = 5

# A RAR archive.
RAR = 6

# More types can probably be added later: TAR, TGZ, BZ2...

def str_filetype(filetype):
    """Translates a droidutil filetype (APK, DEX, ARM...) into its name.

    Returns the matching string, or "UNKNOWN" for any other value."""
    names = (
        (APK, "APK"),
        (DEX, "DEX"),
        (ARM, "ARM"),
        (CLASS, "CLASS"),
        (ZIP, "ZIP"),
        (RAR, "RAR"),
    )
    # compared with == in declaration order, first hit wins
    for code, label in names:
        if filetype == code:
            return label
    return "UNKNOWN"
    

def get_filetype(filename):
    """Returns the droidutil filetype constant for the given absolute filename.

    The type is deduced from the description string returned by
    magic.from_file(filename). This function does not unzip the file, which
    is why an APK is reported as a ZIP here.
    It will return one of these:
    droidutil.ZIP
    droidutil.RAR
    droidutil.ARM
    droidutil.CLASS
    droidutil.DEX
    droidutil.UNKNOWN (no description, or no known keyword in it)
    """
    description = magic.from_file(filename)
    if description is None:
        # this happens if magic is unable to find file type
        return UNKNOWN

    # One entry per alternative of the regexp below
    typecase = { 'Zip archive data' : ZIP,
                 'zip' : ZIP,
                 'Java archive' : ZIP,
                 'RAR archive data' : RAR,
                 'executable, ARM' : ARM,
                 'Java class' : CLASS,
                 'Dalvik dex' : DEX }

    # The leftmost keyword found in the description decides the type.
    # NOTE(review): the bare 'zip' alternative matches any description that
    # merely contains "zip" (presumably also gzip/bzip2 ones) - confirm this
    # is intended before tightening it.
    match = re.search('Zip archive data|zip|RAR archive data|executable, ARM|Java class|Dalvik dex|Java archive', description)
    if match is None:
        return UNKNOWN
    return typecase[match.group(0)]


def get_elements(xmldoc, tag_name, attribute):
    """Collects the given attribute of every tag_name element of xmldoc.

    Returns a list with the repr() of each attribute value, in document
    order. The list is empty when there is no such element."""
    return [repr(node.getAttribute(attribute))
            for node in xmldoc.getElementsByTagName(tag_name)]

def get_element(xmldoc, tag_name, attribute):
    """Looks for the first tag_name element of xmldoc whose attribute is not empty.

    Returns that attribute value, or None when no element has one."""
    values = (node.getAttribute(attribute)
              for node in xmldoc.getElementsByTagName(tag_name))
    return next((value for value in values if len(value) > 0), None)

"""Very simple exception to raise when we found something. For instance to break a loop."""
class Found(Exception):
    """Minimal exception raised when something was found, for instance to break out of a loop."""

class matchresult:
    """One hit of a keyword search: where it was found and on which line."""

    def __init__(self, thefile, theline, thelineno):
        """Stores the match location.

        thefile   -- name of the file holding the match
        theline   -- content of the matching line
        thelineno -- number of that line in the file"""
        self.file, self.line, self.lineno = thefile, theline, thelineno

    def __repr__(self):
        return 'file=%s lineno=%d line=%s' % (self.file, self.lineno, self.line)

    def __str__(self):
        # long file names are shown as '...' followed by their last 70 characters
        shown = self.file if len(self.file) <= 70 else '...'+self.file[-70:]
        return 'file=%50s no=%4d line=%30s' % (shown, self.lineno, self.line)

def _is_excluded(path, exception_list):
    """Tells whether path matches at least one regexp of exception_list."""
    return any(re.search(exception, path) is not None for exception in exception_list)


def recursive_search(search_regexp, directory, exception_list=None, verbose=False):
    """Recursively search in a directory except in some files or subdirectories.

    search_regexp  -- bytes regexp (or compiled bytes pattern): files are
                      read in binary mode and searched line by line
    directory      -- directory to walk through
    exception_list -- list of regexps; any file or directory whose path
                      matches one of them is skipped (default: skip nothing)
    verbose        -- print what is searched and each match

    Returns a dictionary of list of matches:
    match[ keyword ] = [ <'filename', 'matching line content', 'lineno'>,
                         <'filename', 'matching line content', 'lineno'>,
                         <'filename', 'matching line content', 'lineno'>, ]
    where keyword is the matched text decoded as UTF-8 (undecodable bytes
    are replaced) and each item is a matchresult.

    We can only have one match per line. If more are needed, re.search is
    not appropriate and should be replaced by re.findall.

    Subdirectories nested too deeply for the recursion are silently skipped.
    """
    if exception_list is None:
        exception_list = []

    # compile once for the whole directory; a compiled pattern is accepted as is
    pattern = re.compile(search_regexp)
    matches = defaultdict(list)

    if verbose:
        pattern_text = pattern.pattern
        if isinstance(pattern_text, bytes):
            pattern_text = pattern_text.decode('utf-8', errors='replace')
        print("Searching in " + directory + " for " + pattern_text)
        print("Exceptions: %s" % (str(exception_list)))

    for entry in os.listdir(directory):
        current_entry = os.path.join(directory, entry)
        if _is_excluded(current_entry, exception_list):
            continue # skip this file or directory

        if os.path.isfile(current_entry):
            # the with statement closes the file even if the search fails
            with open(current_entry, 'rb') as stream:
                for lineno, line in enumerate(stream, 1):
                    match = pattern.search(line)
                    if match is None:
                        continue
                    keyword = match.group(0).decode('utf-8', errors='replace')
                    if verbose:
                        print("Match: File: " +entry+ " Keyword: " + keyword + " Line: " + line.decode('utf-8', errors='replace'))
                    matches[ keyword ].append(matchresult(current_entry, line, lineno))

        elif os.path.isdir(current_entry):
            try:
                hismatches = recursive_search(pattern, current_entry, exception_list, verbose)
            except RuntimeError:
                # we get this when there are too many recursive dirs
                continue
            # merge in those results
            for key, found in hismatches.items():
                matches[ key ].extend(found)

    return matches