#!/usr/bin/python

import sys, os, os.path, hashlib, subprocess, tempfile, magic, multiprocessing
from optparse import OptionParser
try:
	import tlsh
	tlshscanning = True
except:
	tlshscanning = False

## Binary Analysis Tool
## Copyright 2013-2015 Armijn Hemel for Tjaldur Software Governance Solutions
## Licensed under Apache 2.0, see LICENSE file for details

'''
This program compares two binaries (firmwares, files, etc.) in various ways to
see how close they are.

There are a few scenarios where this program can be used:

1. comparing an old firmware (that is already known and which has been verified)
to a new firmware (update) and see if there are any big differences.
2. comparing a firmware to a rebuild of a firmware as part of compliance
engineering
3. comparing two binaries to see if a certain security bug might be present

A few assumptions are made:

1. both firmwares were unpacked using the Binary Analysis Tool
2. files that are in the original firmware, but not in the new firmware, are
not reported (example: removed binaries). This might change in a future version.
3. files that are in the new firmware but not in the original firmware are
reported, since this would mean additions to the firmware, possibly with
license conditions or security concerns.
4. files that appear in both firmwares but which are not identical are checked
using bsdiff and, if available, tlsh.

With just checksums it is easy to find the files that are different. Using bsdiff
and tlsh it becomes easier to see how big the difference really is.

Low values are probably not interesting at all:
* time stamps (BusyBox, Linux kernel, etc. record a time stamp in the binary)
* slightly different compiler settings

If the diffs get larger there are of course bigger changes.

This approach will make it easier to make a baseline scan of a firmware, then
find, prioritize and scan only the differences in an update of the firmware.
'''

## copied from bruteforce.py
def gethash(path, filename):
	'''
	Compute the SHA256 checksum of a file.

	path     -- directory containing the file. May be an empty string, in
	            which case filename is resolved relative to the current
	            working directory.
	filename -- name of the file inside path

	Returns the checksum as a hexadecimal string. Errors raised while
	opening or reading the file are propagated to the caller.
	'''
	h = hashlib.new('sha256')
	## Open in binary mode: the contents are hashed as raw bytes, so no
	## newline translation or text decoding should ever be applied.
	## os.path.join() is used so an empty path does not turn into "/filename".
	with open(os.path.join(path, filename), 'rb') as scanfile:
		## read in chunks of 10 MB so big files are never loaded completely
		hashdata = scanfile.read(10000000)
		## an empty read result signals end of file
		while hashdata:
			h.update(hashdata)
			hashdata = scanfile.read(10000000)
	return h.hexdigest()

def comparebinaries(path1, path2):
	'''
	Compare two binaries using bsdiff.

	path1 -- path of the first (original) file
	path2 -- path of the second (new) file

	Returns 0 if both files have the same SHA256 checksum, otherwise the
	size in bytes of the patch that bsdiff generates to go from path1 to
	path2. If bsdiff exits with an error a warning is written to stderr
	and the size of whatever patch data was written (possibly 0) is
	returned. An OSError is raised if bsdiff cannot be started at all.
	'''
	## a bare file name has an empty dirname, which means "current directory"
	dirpath1 = os.path.dirname(path1) or '.'
	dirpath2 = os.path.dirname(path2) or '.'

	## binaries are identical, so there is no need to run bsdiff
	if gethash(dirpath1, os.path.basename(path1)) == gethash(dirpath2, os.path.basename(path2)):
		return 0

	## bsdiff writes the patch to a file, so create a temporary file
	## for it. Only the name is needed, so close the descriptor right away.
	(diffhandle, diffpath) = tempfile.mkstemp()
	os.close(diffhandle)
	try:
		p = subprocess.Popen(["bsdiff", path1, path2, diffpath], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
		## wait for bsdiff to finish, its output is not used
		p.communicate()
		if p.returncode != 0:
			## don't let a failed run silently pass for "no differences"
			sys.stderr.write("bsdiff failed for %s and %s (exit code %d)\n" % (path1, path2, p.returncode))
		return os.stat(diffpath).st_size
	finally:
		## cleanup, also when bsdiff could not be run
		os.unlink(diffpath)

## Walk a directory tree and yield a (directory, file name, full path) tuple
## for every regular file in it. Symbolic links and special files are skipped.
def _regularfiles(topdir):
	for (dirpath, dirnames, filenames) in os.walk(topdir):
		for filename in filenames:
			fullpath = os.path.join(dirpath, filename)
			if os.path.islink(fullpath) or not os.path.isfile(fullpath):
				continue
			yield (dirpath, filename, fullpath)

def main(argv):
	'''
	Command line entry point: compare two unpacked directory trees.

	argv -- full argument list, including the program name as the first
	        element (like sys.argv)

	Files are matched on file name only. Every file in the new tree is
	compared to every file with the same name in the original tree and
	differences larger than the bsdiff overhead are printed. Names that
	only occur in the new tree are listed afterwards. Invalid or missing
	options are reported through the option parser, which exits the program.
	'''
	parser = OptionParser()
	parser.add_option("-n", "--new", action="store", dest="newdir", help="path to BAT results of new binary", metavar="DIR")
	parser.add_option("-o", "--original", action="store", dest="olddir", help="path to BAT results of original binary", metavar="DIR")
	## parse the arguments that were passed in, skipping the program name
	(options, args) = parser.parse_args(argv[1:])
	if options.olddir is None or options.newdir is None:
		parser.error("Supply paths to both directories")

	## a regular file would silently result in an empty scan, so
	## explicitly require directories
	if not os.path.isdir(options.olddir):
		parser.error("Directory \"%s\" does not exist" % (options.olddir,))

	if not os.path.isdir(options.newdir):
		parser.error("Directory \"%s\" does not exist" % (options.newdir,))

	## The goal is to check the files from the new binary and
	## compare them with files from the old binary
	## First build a list of files in the original binary
	## Then do the same for the new binary and check:
	## * does a file with the same name exist in the original binary
	## * do the files differ
	## and report about it

	## file name -> list of full paths in the original binary
	checkfiles = {}
	for (dirpath, filename, fullpath) in _regularfiles(options.olddir):
		checkfiles.setdefault(filename, []).append(fullpath)

	notfoundnewdir = []
	notfoundorigdir = []
	## now loop over the new binary
	for (dirpath, filename, fullpath) in _regularfiles(options.newdir):
		## name of this file can't be found in old scan tree, so report
		if filename not in checkfiles:
			notfoundnewdir.append(filename)
			continue
		for oldpath in checkfiles[filename]:
			diff = comparebinaries(oldpath, fullpath)
			## bsdiff between two identical files is 143 bytes
			if diff <= 143:
				continue
			print("* %s and %s differ %d bytes according to bsdiff" % ("%s/%s" % (dirpath, filename), oldpath, diff))

	if notfoundnewdir != []:
		print("\nThe following files from the new binary were not found in the original binary:")
		for i in notfoundnewdir:
			print("* %s" % i)

	## TODO: check for files in the original directory as well, although
	## removal of files might not be as interesting
	## (notfoundorigdir is never filled at the moment, so nothing is printed)
	if notfoundorigdir != []:
		print("\nThe following files from the original binary were not found in the new binary:")
		for i in notfoundorigdir:
			print("* %s" % i)

## Only run the comparison when this file is executed as a script.
## The complete argument list (including the program name) is passed to main().
if __name__ == "__main__":
        main(sys.argv)