''' BotDigger detects Domain Generation Algorithm based botnets based on DNS traffic Copyright (C) <2015> <Han Zhang> BotDigger is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. BotDigger is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. Contact information: Email: zhanghan0116@gmail.com ''' from __future__ import division import optparse import Levenshtein import getopt import socket import dpkt, dpkt.dns import sys import subprocess import datetime import re import math import socket import pythonwhois import time import smtplib import numpy import pcap import operator import bz2 import gzip import pylzma as xz import zipfile from os import walk from dnslib import * from collections import * from wordsegment import segment from optparse import OptionParser from netaddr import IPNetwork, IPAddress botDetected = 0 enableEmail = 0 enable2LDProbe = 0 enableVerbose = 0 localMaxSuspicious2LD = 2 timeInterval = 600 # time window thresholdSuspiciousNXDOMAIN = 5 #thresholdDynamic = 3 timestampPeriodBegin = 0.0 thresholdDistance = 0.2 thresholdSimilarity = 0.1 # threshold for clustering algorithm thresholdBotsOneCluster = 4 thresholdSignature = 16 temporalWindow = 5 receiver = "" emailContents = "" outputContentBot = "" outputFilePrefix = "" hostDict = dict() botDict = dict() tldDict = dict() ccTldDict = dict() nonCcTldDict = dict() blWebsitesDict = dict() dictionaryDict = dict() configWordsDict = dict() existingSLDDict = dict() popularDomainDict = dict() bigEnterpriseDict = dict() excludedHostsDict 
excludedHostsDict = dict()
dynamicDomainDict = dict()
excludedDomainsDict = dict()

# DNS record type codes.
# From http://www.networksorcery.com/enp/protocol/dns.htm
type_table = {
    1: "A",        # IP v4 address, RFC 1035
    2: "NS",       # Authoritative name server, RFC 1035
    5: "CNAME",    # Canonical name for an alias, RFC 1035
    6: "SOA",      # Marks the start of a zone of authority, RFC 1035
    12: "PTR",     # Domain name pointer, RFC 1035
    13: "HINFO",   # Host information, RFC 1035
    15: "MX",      # Mail exchange, RFC 1035
    28: "AAAA",    # IP v6 address, RFC 3596
}

# Leading "magic" bytes of the supported compressed formats.
# From http://www.garykessler.net/library/file_sigs.html
compressed_file_table = {
    b"\x42\x5a\x68": "bz2",
    b"\x1f\x8b\x08": "gz",
    b"\xfd\x37\x7a\x58\x5a\x00": "xz",
    b"\x50\x4b\x03\x04": "zip",
}


# From http://stackoverflow.com/a/13044946/3934402
def file_type(filename):
    """Return "bz2", "gz", "xz" or "zip" for a compressed file, else None.

    The decision is made from the first bytes of the file only; the file
    is read in binary mode so the signature bytes are never translated.
    """
    max_len = max(len(magic) for magic in compressed_file_table)
    with open(filename, "rb") as f:
        file_start = f.read(max_len)
    for magic, filetype in compressed_file_table.items():
        if file_start.startswith(magic):
            return filetype
    return None


def openFile(filename, modes):
    """Open filename, transparently choosing a reader for compressed files.

    modes is used for plain files and for the raw handle given to the xz
    reader; the bz2, gz and zip readers are opened with their defaults.
    Raises LookupError if file_type() reports an unknown type.
    """
    filetype = file_type(filename)
    if filetype is None:
        return open(filename, modes)
    if filetype == "bz2":
        return bz2.BZ2File(filename)
    if filetype == "gz":
        return gzip.open(filename)
    if filetype == "xz":
        # The raw handle must outlive this function: wrapping it in a
        # "with" block would hand the caller a reader over a closed file.
        return xz.LZMAFile(open(filename, modes))
    if filetype == "zip":
        return zipfile.ZipFile(filename)
    # should never get here
    raise LookupError("filetype is invalid")


class Host():
    """Per-host DNS bookkeeping: response counters and queried domains."""

    def __init__(self):
        self.IP = ""
        self.startTime = 0
        self.startTimePeriod = 0
        self.endTime = 0
        self.labeled = 0
        # counters per DNS response code
        self.noError = 0
        self.formatError = 0
        self.serverFail = 0
        self.NXDOMAIN = 0
        self.suspiciousNXDOMAIN = 0
        self.suspiciousNXDOMAIN2LD = 0
        self.notImplement = 0
        self.refused = 0
        # domains seen per DNS response code
        self.noErrorDict = dict()
        self.formatErrorDict = dict()
        self.serverFailDict = dict()
        self.NXDOMAINDict = dict()
        self.suspiciousNXDOMAINDict = dict()
        self.suspiciousNXDOMAIN2LDDict = dict()
        self.suspiciousNXDOMAINPeriodDict = dict()
        self.notImplementDict = dict()
        self.refusedDict = dict()
        self.suspiciousNXDOMAINList = list()
        self.suspiciousNXDOMAINPeriodList = list()
        self.suspiciousNXDOMAINPeriodCountList = list()

    def __eq__(self, other):
        """Two hosts are equal when they have the same IP."""
        if not isinstance(other, Host):
            # Let Python try the reflected comparison instead of raising.
            return NotImplemented
        return self.IP == other.IP

    def __ne__(self, other):
        """Inverse of __eq__ (Python 2 does not derive it automatically)."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result


def initialize_tables():
    """Placeholder kept for callers; it only declares type_table global."""
    global type_table


# functions to load files
def loadNetworkPrefix(networkPrefixFile, networkPrefixDict):
    """Add every non-empty line of the file as a key of networkPrefixDict."""
    with open(networkPrefixFile, 'r') as fp:
        for line in fp:
            info = line.strip('\n')
            if info:
                networkPrefixDict[info] = None


def loadExcludedHosts(excludedHostsFile, excludedHostsDict):
    """Add the lower-cased first tab-separated field of each line."""
    with open(excludedHostsFile, 'r') as fp:
        for line in fp:
            host = line.strip('\n').lower().split('\t')[0]
            if host:
                excludedHostsDict[host] = None


def loadDictionary(dictionaryFile, dictionaryDict):
    """Add every lower-cased word of at least three characters."""
    with open(dictionaryFile, 'r') as fp:
        for line in fp:
            info = line.strip('\n').lower()
            if len(info) >= 3:
                dictionaryDict[info] = None


def loadBLWebsites(blWebsitesDict, blWebsitesFile):
    """Add the first whitespace-separated token of each non-blank line."""
    with open(blWebsitesFile, 'r') as fp:
        for line in fp:
            info = line.split()
            # Blank lines yield no tokens; skip them instead of crashing.
            if info and info[0] not in blWebsitesDict:
                blWebsitesDict[info[0]] = None


def loadConfigWords(configWordsDict, configWordsFile):
    """Add the first whitespace-separated token of each non-blank line."""
    with open(configWordsFile, 'r') as fp:
        for line in fp:
            info = line.split()
            if info and info[0] not in configWordsDict:
                configWordsDict[info[0]] = None


def loadSLDExistence(sldExistenceFile, existingSLDDict):
    """Load "<domain> <flag>" lines as existingSLDDict[domain] = int(flag)."""
    with open(sldExistenceFile, 'r') as fp:
        for line in fp:
            info = line.strip('\n').lower().split(' ')
            if len(info) >= 2:
                existingSLDDict[info[0]] = int(info[1])


def loadDynamicDomain(dynamicDomainDict, dynamicDomainFile):
    """Add every non-empty line of the file as a dynamic DNS domain."""
    with open(dynamicDomainFile, 'r') as fp:
        for line in fp:
            info = line.strip("\n")
            # An empty key would make "." + key match every domain name.
            if info and info not in dynamicDomainDict:
                dynamicDomainDict[info] = None


def loadDNSServer(dnsServerDict, dnsServerFile):
    """Add every non-empty line of the file as a key of dnsServerDict."""
    with open(dnsServerFile, 'r') as fp:
        for line in fp:
            info = line.strip("\n")
            if info and info not in dnsServerDict:
                dnsServerDict[info] = None


def loadKnownTLD(tldDict, ccTldDict, nonCcTldDict, tldListFile):
    """Load "<tld>\\t<description>" lines into the three TLD dicts.

    A TLD whose description contains "country" goes to ccTldDict, any
    other one (including a line without description) to nonCcTldDict.
    """
    with open(tldListFile, 'r') as fp:
        for line in fp:
            info = line.strip('\n').split('\t')
            tld = info[0]
            if not tld or tld in tldDict:
                continue
            tldDict[tld] = None
            description = info[1] if len(info) > 1 else ""
            if "country" in description:
                ccTldDict[tld] = None
            else:
                nonCcTldDict[tld] = None


def loadExludedDomains(excludedDomainsDict, excludedDomainsFile, tldDict, ccTldDict):
    """Add the second-level label of every listed domain."""
    with open(excludedDomainsFile, 'r') as fp:
        for line in fp:
            domain2LD = extractLevelDomain(line.strip("\n"), ccTldDict, tldDict)[0]
            # Lines without a second-level label would add an empty key.
            if domain2LD and domain2LD not in excludedDomainsDict:
                excludedDomainsDict[domain2LD] = None


def loadBigEnterprises(bigEnterpriseDict, bigEnterpriseFile, tldDict, ccTldDict):
    """Add the second-level label of the third tab-separated field."""
    with open(bigEnterpriseFile, 'r') as fp:
        for line in fp:
            info = line.strip("\n").split("\t")
            if len(info) < 3:
                continue
            domain2LD = extractLevelDomain(info[2], ccTldDict, tldDict)[0]
            if domain2LD and domain2LD not in bigEnterpriseDict:
                bigEnterpriseDict[domain2LD] = None


def loadPopularDomain(popularDomainDict, popularDomainFile, tldDict, ccTldDict):
    """Add the second-level label of the first 1,000 "<rank>,<domain>" lines."""
    with open(popularDomainFile, 'r') as fp:
        for lineCount, line in enumerate(fp, 1):
            if lineCount > 1000:
                # consider the first 1,000 domains from Alexa
                break
            info = line.strip("\n").split(",")
            if len(info) < 2:
                continue
            domain2LD = extractLevelDomain(info[1], ccTldDict, tldDict)[0]
            if domain2LD and domain2LD not in popularDomainDict:
                popularDomainDict[domain2LD] = None
# end of load functions


# filters to remove unsuspicious domains
def _toText(value):
    """Return value as text, decoding UTF-8 byte strings (Python 2 str)."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def distanceDomain(domain, DomainDict, ccTldDict, tldDict):
    """Find the entry of DomainDict closest to the 2LD label of domain.

    Returns (closestEntry, editDistance / len(closestEntry)).  When domain
    has no dot, ("not a domain", sys.maxsize) is returned.  When DomainDict
    offers no usable entry, (domain2LD, sys.maxsize) is returned so that an
    empty reference list never looks like a perfect match.
    """
    if len(domain.split(".")) <= 1:
        return ("not a domain", sys.maxsize)
    domain2LD = extractLevelDomain(domain, ccTldDict, tldDict)[0]
    target = _toText(domain2LD)
    similarDomain = ""
    minDistance = sys.maxsize
    for candidate in DomainDict:
        if not candidate:
            # An empty entry cannot be normalised by its length.
            continue
        distance = Levenshtein.distance(target, _toText(candidate))
        if distance < minDistance:
            minDistance = distance
            similarDomain = candidate
    if similarDomain:
        return (similarDomain, minDistance / float(len(similarDomain)))
    return (domain2LD, sys.maxsize)


# check whether a domain contains invalid TLD
def searchTLDList(domain, tldDict):
    """Return 1 if the last label of domain is a known TLD, else 0."""
    lastLabel = "." + domain.split(".")[-1]
    if lastLabel in tldDict:
        return 1
    return 0


# check whether a domain contains overloaded DNS query
def searchBLWebsites(domain, blWebsitesDict):
    """Return 0 if domain contains any entry of blWebsitesDict, else 1."""
    if any(website in domain for website in blWebsitesDict):
        return 0
    return 1


# check whether a domain contains configuration words
def searchConfigWords(domain, configWordsDict):
    """Return 0 if domain contains any entry of configWordsDict, else 1."""
    if any(configWord in domain for configWord in configWordsDict):
        return 0
    return 1


# check whether a domain contains ".arpa"
def arpaNXDOMAIN(domain):
    """Return 0 for reverse-lookup names (in-addr.arpa / ip6.arpa), else 1."""
    if "in-addr.arpa" in domain or "ip6.arpa" in domain:
        return 0
    return 1


# check whether a domain contains PC name, e.g., HAN-PC
# currently not used
def searchLocalPC(domain):
    """Always return 1; the filter is not implemented."""
    return 1


# One to three digits and nothing else, i.e. one candidate IPv4 octet.
_IP_OCTET_PATTERN = re.compile(r"[0-9]{1,3}$")


# check whether a domain contains IP address, e.g., 1.2.3.4.example.com, 1-2-3-4.example.com
def searchIPDomain(domain):
    """Return 0 if domain embeds four consecutive numeric parts, else 1.

    The parts may be four consecutive dot-separated labels or four
    consecutive dash-separated pieces inside one label.
    """
    dottedRun = 0
    for label in domain.split("."):
        dottedRun = dottedRun + 1 if _IP_OCTET_PATTERN.match(label) else 0
        if dottedRun == 4:
            return 0
        dashedRun = 0
        for piece in label.split("-"):
            dashedRun = dashedRun + 1 if _IP_OCTET_PATTERN.match(piece) else 0
            if dashedRun == 4:
                return 0
    return 1


# check whether a domain contains repeated TLDs, e.g., www.example.com.foo.com
# need to improve
def searchRepeatTLD(domain, tldDict):
    """Return 0 if a TLD label reappears left of the trailing TLD run, else 1.

    Labels are scanned right to left; a label that is a known TLD is only
    accepted at the rightmost position or directly left of the previously
    accepted TLD label.
    """
    labels = domain.split(".")
    if len(labels) == 1:
        return 1
    lastTldOffset = 0
    for offset, label in enumerate(reversed(labels)):
        if "." + label in tldDict:
            if offset != 0 and offset != lastTldOffset + 1:
                return 0
            lastTldOffset = offset
    return 1


# label whether a domain is suspicious
def labelSuspiciousDomain(domain, tldDict, ccTldDict):
    """Return 1 if domain survives every benign-domain filter, else 0.

    The cheap string filters run first; the Levenshtein comparisons against
    the popular, big-enterprise and excluded domain lists only run for
    domains that passed everything before them.
    """
    # ignore domain that contains overloaded DNS queries
    if searchBLWebsites(domain, blWebsitesDict) == 0:
        return 0
    # ignore domain that contains invalid TLD
    if searchTLDList(domain, tldDict) == 0:
        return 0
    # ignore domain that contains repeated TLD
    if searchRepeatTLD(domain, tldDict) == 0:
        return 0
    # ignore domain that contains "in-addr.arpa/ip6.arpa"
    if arpaNXDOMAIN(domain) == 0:
        return 0
    # ignore domain that contains configuration words
    if searchConfigWords(domain, configWordsDict) == 0:
        return 0
    # ignore "local" (e.g., HAN-PC.colostate.edu.edu)
    if searchLocalPC(domain) == 0:
        return 0
    # ignore domain that contains an IP
    if searchIPDomain(domain) == 0:
        return 0
    # ignore typos of popular domains, of big enterprise domains and of
    # excluded domains (e.g., colostate)
    for referenceDict in (popularDomainDict, bigEnterpriseDict, excludedDomainsDict):
        (typoDomain, distance) = distanceDomain(domain, referenceDict, ccTldDict, tldDict)
        if distance <= thresholdDistance:
            return 0
    return 1
# end of functions to label/remove suspicious domains


# functions of extract linguistic attributes from domains
def strEntropy(levelDomain):
    """Return (entropy, normalizedEntropy) of the characters of a label.

    Entropy is in bits; it is normalised by log2(len(levelDomain)).  An
    empty label yields (0, 0) and a one-character label a normalised
    entropy of 0.0.
    """
    length = len(levelDomain)
    if length == 0:
        return (0, 0)
    entropy = 0.0
    for character in set(levelDomain):
        freq = levelDomain.count(character) / float(length)
        entropy = entropy - freq * math.log(freq, 2)
    normalizedEntropy = 0.0
    maxEntropy = math.log(length, 2)
    if maxEntropy > 0:
        normalizedEntropy = entropy / maxEntropy
    return (entropy, normalizedEntropy)


def _dynamicSuffix(domain):
    """Return the dynamic DNS domain that domain equals or ends with, or None."""
    for dynamicDomain in dynamicDomainDict:
        if not dynamicDomain:
            continue
        if domain == dynamicDomain or domain.endswith("." + dynamicDomain):
            return dynamicDomain
    return None


def domainLevels(domain, ccTldDict, tldDict):
    """Return the number of labels of domain left of its public suffix.

    The suffix is a matching dynamic DNS domain, a "generic TLD + country
    code" pair (e.g. .com.cn) or the last label.
    """
    info = domain.split(".")
    dynamicDomain = _dynamicSuffix(domain)
    if dynamicDomain is not None:
        return len(info) - len(dynamicDomain.split("."))
    if "." + info[-1] not in ccTldDict:
        # e.g., www.hello.example.com
        return len(info) - 1
    if len(info) < 2:
        return 0
    if "." + info[-2] in tldDict:
        # e.g., www.hello.example.com.cn
        return len(info) - 2
    # e.g., www.hello.example.cn
    return len(info) - 1


def wordBreak(word, dictionaryDict):
    """Return the total length of the segments of word found in dictionaryDict."""
    return sum(len(piece) for piece in segment(word) if piece in dictionaryDict)


def extractLevelDomain(domain, ccTldDict, tldDict):
    """Split domain into its second- and third-level parts.

    Returns (domain2LD, domain3LD, domain2LDs, domain3LDs): the bare 2LD
    and 3LD labels and the same labels followed by everything to their
    right.  Parts the domain is too short for are empty strings.  The
    public suffix is a dynamic DNS domain the name ends with, a "generic
    TLD + country code" pair (e.g. .com.cn) or the last label.
    """
    info = domain.split(".")
    suffixLabels = 1
    for dynamicDomain in dynamicDomainDict:
        # Only a true suffix counts: ".dyndns.org" in the middle of a name
        # must not shift the label positions.
        if dynamicDomain and domain.endswith("." + dynamicDomain):
            suffixLabels = len(dynamicDomain.split("."))
            break
    else:
        if ("." + info[-1] in ccTldDict and len(info) >= 2
                and "." + info[-2] in tldDict):
            # e.g., www.hello.example.com.cn
            suffixLabels = 2
    domain2LD = ""
    domain3LD = ""
    domain2LDs = ""
    domain3LDs = ""
    if len(info) >= suffixLabels + 1:
        domain2LD = info[-suffixLabels - 1]
        domain2LDs = domain2LD + "." + ".".join(info[-suffixLabels:])
    if len(info) >= suffixLabels + 2:
        domain3LD = info[-suffixLabels - 2]
        domain3LDs = domain3LD + "." + domain2LDs
    return (domain2LD, domain3LD, domain2LDs, domain3LDs)


def _ratio(count, length):
    """Return count / length as a float, or 0 when length is 0."""
    if length == 0:
        return 0
    return count / float(length)


# extract linguistic attributes from domain
def extractAttributes(domain, ccTldDict, tldDict, fpOutput):
    """Return the list of 23 linguistic attributes of domain.

    fpOutput is accepted for interface compatibility and is not written to.
    The order of the returned values is part of the contract: signatures
    are built and compared position by position.
    """
    (domain2LD, domain3LD, domain2LDs, domain3LDs) = extractLevelDomain(domain, ccTldDict, tldDict)
    length2LD = len(domain2LD)
    length3LD = len(domain3LD)

    # 1,2: length of dictionary words in 2LD and 3LD
    # 3,4: percent of dictionary words in 2LD and 3LD
    meaningfulWordsLength2LD = wordBreak(domain2LD, dictionaryDict)
    meaningfulWordsLength3LD = wordBreak(domain3LD, dictionaryDict)

    # 5,6: the length of the longest meaningful substring in 2LD and 3LD
    # 7,8: percent of the length of the longest meaningful substring
    lengthLMS2LD = 0
    lengthLMS3LD = 0
    for word in dictionaryDict:
        wordLength = len(word)
        if wordLength > lengthLMS2LD and word in domain2LD:
            lengthLMS2LD = wordLength
        if wordLength > lengthLMS3LD and word in domain3LD:
            lengthLMS3LD = wordLength

    # 9,10,11,12: entropy, normalizedEntropy in 2LD and 3LD
    (entropy2LD, normalizedEntropy2LD) = strEntropy(domain2LD)
    (entropy3LD, normalizedEntropy3LD) = strEntropy(domain3LD)

    # 13: number of levels
    domainLevel = domainLevels(domain, ccTldDict, tldDict)

    # 16, 17: number of distinct digital characters in 2LD and 3LD
    distinctNumbers2LD = len(set(re.findall(r'\d', domain2LD)))
    distinctNumbers3LD = len(set(re.findall(r'\d', domain3LD)))

    # 20, 21: number of distinct non-digit characters in 2LD and 3LD
    distinctChar2LD = len(set(re.sub("[0-9]", "", domain2LD)))
    distinctChar3LD = len(set(re.sub("[0-9]", "", domain3LD)))

    return [
        meaningfulWordsLength2LD,                        # 1
        meaningfulWordsLength3LD,                        # 2
        _ratio(meaningfulWordsLength2LD, length2LD),     # 3
        _ratio(meaningfulWordsLength3LD, length3LD),     # 4
        lengthLMS2LD,                                    # 5
        lengthLMS3LD,                                    # 6
        _ratio(lengthLMS2LD, length2LD),                 # 7
        _ratio(lengthLMS3LD, length3LD),                 # 8
        entropy2LD,                                      # 9
        normalizedEntropy2LD,                            # 10
        entropy3LD,                                      # 11
        normalizedEntropy3LD,                            # 12
        domainLevel,                                     # 13
        length2LD,                                       # 14
        length3LD,                                       # 15
        distinctNumbers2LD,                              # 16
        distinctNumbers3LD,                              # 17
        _ratio(distinctNumbers2LD, length2LD),           # 18
        _ratio(distinctNumbers3LD, length3LD),           # 19
        distinctChar2LD,                                 # 20
        distinctChar3LD,                                 # 21
        _ratio(distinctChar2LD, length2LD),              # 22
        _ratio(distinctChar3LD, length3LD),              # 23
    ]
# end of functions of extract linguistic attributes from domains


# update host information
def updateResponseDomain(hostDict, domain, qtype, rcode, ip, timestamp, domainIP, fpOutput):
    """Record one DNS response for host ip.

    Only type A queries (qtype 1) are considered.  A NOERROR answer
    (rcode 0) is stored in noErrorDict; an NXDOMAIN answer (rcode 3) that
    labelSuspiciousDomain() keeps is added to the suspicious NXDOMAIN
    counters, dicts and list of the host.  hostDict[ip] must already exist.
    """
    domain = domain.lower()
    if qtype != 1:
        return
    host = hostDict[ip]
    if rcode == 0:
        host.noErrorDict[domain] = (timestamp, domainIP)
    if rcode != 3:
        return
    # filter non-malicious domains
    if labelSuspiciousDomain(domain, tldDict, ccTldDict) != 1:
        return
    # record the NXDomains queried by a host
    host.suspiciousNXDOMAIN = host.suspiciousNXDOMAIN + 1
    host.suspiciousNXDOMAINDict[domain] = timestamp
    host.suspiciousNXDOMAINList.append((domain, timestamp))
    host.NXDOMAINDict[domain] = host.NXDOMAINDict.get(domain, 0) + 1
    host.suspiciousNXDOMAINPeriodDict[domain] = 1
    host.endTime = timestamp
    domain2LDs = extractLevelDomain(domain, ccTldDict, tldDict)[2]
    # Count each distinct 2LD once, but always refresh its timestamp.
    if domain2LDs not in host.suspiciousNXDOMAIN2LDDict:
        host.suspiciousNXDOMAIN2LD += 1
    host.suspiciousNXDOMAIN2LDDict[domain2LDs] = timestamp


# functions for bot detection
def botDetection(host, fpOutput):
    """Run temporal and linguistic bot detection for one host.

    The suspicious NXDomains queried between the CUSUM increase and
    decrease points (plus 60 seconds on both sides) are filtered with
    removeLegitimateDomains() and, if at least thresholdBotsOneCluster
    remain, clustered with singleLinkageClustering().
    """
    global outputContentBot
    # locate the time during when bot contacts C&C servers.
    # temporal feature 1: number of suspicious NXDomains increases quickly
    # temporal feature 2: number of suspicious NXDomains stops increasing
    (increaseStart, increaseEnd, increaseStartTimestamp) = increaseCUSUM(host, localMaxSuspicious2LD, thresholdBotsOneCluster, fpOutput)
    if increaseStart == -1 or increaseEnd == -1:
        if enableVerbose == 1:
            fpOutput.write("No temporal evidence found\n")
        return
    (decreaseStart, decreaseEnd, decreaseEndTimestamp) = decreaseCUSUM(host, localMaxSuspicious2LD, thresholdBotsOneCluster, increaseStart, fpOutput)
    if decreaseStart < increaseStart:
        return
    if enableVerbose == 1:
        fpOutput.write("increase starts at offset %d, time: %f, decrease ends at offset %d, time: %f\n" % (increaseStart, increaseStartTimestamp, decreaseEnd, decreaseEndTimestamp))

    # focus on the suspicious NXDomains queried during the above time window
    windowStart = increaseStartTimestamp - 60
    windowEnd = decreaseEndTimestamp + 60
    domainsList = [
        domain for domain in host.suspiciousNXDOMAINDict
        if windowStart < host.suspiciousNXDOMAINDict[domain] < windowEnd
    ]
    outputContentBot += "Suspicious NXDomains queried by %s:\n" % host.IP
    if enableVerbose == 1:
        fpOutput.write("Suspicious NXDomains queried by %s:\n" % host.IP)
    for domain in domainsList:
        outputContentBot += "%s\n" % domain
        if enableVerbose == 1:
            fpOutput.write("%s\n" % domain)

    domains = removeLegitimateDomains(domainsList, ccTldDict, tldDict, dynamicDomainDict, dictionaryDict)
    outputContentBot += "Suspicious NXDomains queried by %s after removing the ones containing registered 2LD\n" % host.IP
    if enableVerbose == 1:
        fpOutput.write("Suspicious NXDomains queried by %s after removing the ones containing registered 2LD\n" % host.IP)
    for domain in domains:
        outputContentBot += "%s\n" % domain
        if enableVerbose == 1:
            fpOutput.write("%s\n" % domain)
    if len(domains) < thresholdBotsOneCluster:
        if enableVerbose == 1:
            fpOutput.write("number of domains: %d is less than thresholdBotsOneCluster: %d\n" % (len(domains), thresholdBotsOneCluster))
        return

    # symmetric matrix of pairwise scores with a zero diagonal
    domainAttributesList = [extractAttributes(domain, ccTldDict, tldDict, fpOutput) for domain in domains]
    similarityMatrix = [[0] * len(domains) for _ in domains]
    for index1 in range(len(domains)):
        for index2 in range(index1 + 1, len(domains)):
            score = similarityCalculation(domainAttributesList[index1], domainAttributesList[index2])
            similarityMatrix[index1][index2] = score
            similarityMatrix[index2][index1] = score
    singleLinkageClustering(similarityMatrix, domains, host, fpOutput)


def singleLinkageClustering(similarityMatrix, domains, host, fpOutput):
    """Cluster domains by repeatedly merging the closest pair of clusters.

    Merging stops once the smallest score between two different clusters
    reaches thresholdSimilarity.  If the largest cluster then holds at
    least thresholdBotsOneCluster domains the host is reported, detectCnC()
    is run and botDict[host.IP] is incremented.  Always returns 0.
    """
    global outputContentBot
    ip = host.IP
    domainCount = len(domains)
    mostDomainsOneCluster = 0
    # Every domain starts in its own cluster, identified by its index.
    clusters = list(range(domainCount))
    whileLoop = 0
    while True:
        leastSimilarity = 1
        pair1 = 0
        pair2 = 0
        for index1 in range(domainCount - 1):
            for index2 in range(index1 + 1, domainCount):
                if similarityMatrix[index1][index2] < leastSimilarity and clusters[index1] != clusters[index2]:
                    leastSimilarity = similarityMatrix[index1][index2]
                    pair1 = index1
                    pair2 = index2
        if enableVerbose == 1:
            fpOutput.write("\nLeast Similarity is: %f between cluster %d and cluster %d\n" % (leastSimilarity, pair1, pair2))

        if leastSimilarity >= thresholdSimilarity:
            if mostDomainsOneCluster >= thresholdBotsOneCluster:
                # check the existence of sld, how long it has been registered, and the contacts information
                outputContentBot += "Host %s is detected as suspicious based on the linguistic feature 1: %d suspicious NXDomains are clustered together since their similarity is less than %f\n" % (ip, mostDomainsOneCluster, thresholdSimilarity)
                outputContentBot += "The largest cluster has %d domains\n" % mostDomainsOneCluster
                fpOutput.write("Host %s is detected as suspicious based on the linguistic feature 1: %d suspicious NXDomains are clustered together since their similarity is less than %f\n" % (ip, mostDomainsOneCluster, thresholdSimilarity))
                fpOutput.write("The largest cluster has %d domains\n" % (mostDomainsOneCluster))
                detectCnC(clusters, domains, host, fpOutput)
                botDict[ip] = botDict.get(ip, 0) + 1
            break

        if enableVerbose == 1:
            fpOutput.write("\nRound %d: Merge cluster %d and cluster %d\n" % (whileLoop + 1, clusters[pair1], clusters[pair2]))
        pair1Cluster = clusters[pair1]
        pair2Cluster = clusters[pair2]
        clusters = [pair1Cluster if cluster == pair2Cluster else cluster for cluster in clusters]
        if enableVerbose == 1:
            for index in range(domainCount):
                fpOutput.write("%d %s\n" % (clusters[index], domains[index]))

        # recount the clusters and remember the size of the largest one
        count = 1
        for clusterId in range(domainCount):
            members = [domains[index] for index in range(domainCount) if clusters[index] == clusterId]
            if not members:
                continue
            if enableVerbose == 1:
                fpOutput.write("\nCluster %d \n" % count)
                for position, member in enumerate(members, 1):
                    fpOutput.write("domain: %d, %s\n" % (position, member))
                fpOutput.write("\nThis cluster has %d domains\n" % len(members))
            count = count + 1
            if len(members) > mostDomainsOneCluster:
                mostDomainsOneCluster = len(members)
        whileLoop = whileLoop + 1
    return 0


# calculate similarity between two domains
def similarityCalculation(domain1AttributesList, domain2AttributesList):
    """Return the root mean square relative difference of two attribute lists.

    Each attribute contributes |a - b| / max(a, b), or 0 when both values
    are 0.  Smaller scores mean more similar domains; two empty lists
    score 0.
    """
    attributeCount = len(domain1AttributesList)
    if attributeCount == 0:
        return 0
    squaredSum = 0
    for index in range(attributeCount):
        first = float(domain1AttributesList[index])
        second = float(domain2AttributesList[index])
        if first == 0 and second == 0:
            continue
        squaredSum = squaredSum + (abs(first - second) / max(first, second)) ** 2
    return math.sqrt(squaredSum / float(attributeCount))


# extract signatures from bot clusters of NXDomains
def extractSignature(NXDomainList, fpOutput):
    """Build per-attribute signatures from [domain, attributes] records.

    Returns (signatureLower, signatureUpper, signatureMin, signatureMax):
    mean - 3*std, mean + 3*std, minimum and maximum of every attribute
    over all records.  NXDomainList must not be empty.
    """
    attributesMatrix = [record[1] for record in NXDomainList]
    meanArray = numpy.mean(attributesMatrix, axis=0)
    stdArray = numpy.std(attributesMatrix, axis=0)
    attributeRange = range(len(attributesMatrix[0]))
    signatureLower = [meanArray[i] - 3 * stdArray[i] for i in attributeRange]
    signatureUpper = [meanArray[i] + 3 * stdArray[i] for i in attributeRange]
    signatureMin = [min(item[i] for item in attributesMatrix) for i in attributeRange]
    signatureMax = [max(item[i] for item in attributesMatrix) for i in attributeRange]
    if enableVerbose == 1:
        fpOutput.write("signatures\n")
        fpOutput.write("%s\n" % str(signatureLower))
        fpOutput.write("%s\n" % str(signatureUpper))
    return (signatureLower, signatureUpper, signatureMin, signatureMax)


# check whether a single domain matches the signature
def compareSignature(attributes, signatureLower, signatureUpper, signatureMin, signatureMax, fpOutput):
    """Return how many attributes lie within [signatureMin, signatureMax].

    signatureLower, signatureUpper and fpOutput are accepted for interface
    compatibility and are not used.
    """
    return sum(
        1 for i in range(len(attributes))
        if signatureMin[i] <= attributes[i] <= signatureMax[i]
    )


# extract signatures and then apply them on all the successfully resolved domains to
# detect C&C domain
def detectCnC(clusters, domains, host, fpOutput):
    """Search the domains `host` resolved successfully for C&C candidates.

    clusters -- cluster label of every entry in `domains` (same indexing)
    domains  -- suspicious NXDOMAINs of the host that were clustered
    host     -- host record; reads suspiciousNXDOMAINDict (domain -> timestamp),
                noErrorDict (domain -> sequence with timestamp at [0] and IP
                at [1]) and IP
    fpOutput -- open result file, every finding is written to it

    Clusters with at least thresholdBotsOneCluster domains contribute their
    attributes to a signature.  Every successfully resolved, suspicious
    domain seen within 60 seconds of those clusters is compared against the
    signature; a domain matching at least thresholdSignature attributes is
    reported as C&C domain.  Findings are appended to the per-host bot file
    and, if enabled, sent by email.  Returns nothing.
    """
    global botDetected
    global emailContents
    global outputContentBot

    lastTimestamp = 0
    firstTimestamp = 0
    botsAttributesTotalList = list()
    botsNXDomainList = list()

    for clusterId in set(clusters):
        clusterDomains = list()
        for index in range(0, len(domains)):
            if clusters[index] == clusterId:
                clusterDomains.append(domains[index])
                clusterLine = "cluster: %d, domain: %s\n" % (clusterId, domains[index])
                outputContentBot += clusterLine
                fpOutput.write(clusterLine)
        if len(clusterDomains) < thresholdBotsOneCluster:
            continue
        clusterAttributes = list()
        for domain in clusterDomains:
            botsNXDomainList.append(domain)
            attributes = extractAttributes(domain, ccTldDict, tldDict, fpOutput)
            clusterAttributes.append([domain, attributes])
            botsAttributesTotalList.append([domain, attributes])
            queriedAt = host.suspiciousNXDOMAINDict[domain]
            if queriedAt > lastTimestamp:
                lastTimestamp = queriedAt
            if firstTimestamp == 0 or queriedAt < firstTimestamp:
                firstTimestamp = queriedAt
        # The per-cluster signature is not used below; the call is kept
        # because extractSignature receives fpOutput and may report to it.
        extractSignature(clusterAttributes, fpOutput)

    CnCDetected = 0
    CnCDomainList = list()
    if enableVerbose == 1:
        fpOutput.write("host %s has %d noError domain\n" % (host.IP, len(host.noErrorDict)))
    for domain in host.noErrorDict.keys():
        resolvedAt = host.noErrorDict[domain][0]
        domainIP = host.noErrorDict[domain][1]
        # apply the signatures on all the successfully resolved domains
        # 60 seconds before and after the bot clusters
        inWindow = (resolvedAt > firstTimestamp - 60) and (resolvedAt < lastTimestamp + 60)
        if not (inWindow and domainIP):
            continue
        if labelSuspiciousDomain(domain, tldDict, ccTldDict) == 0:
            continue
        # NOTE(review): the signature does not depend on `domain`; it is
        # still recomputed per candidate because extractSignature may write
        # to fpOutput - confirm before hoisting it out of the loop.
        (signatureLower, signatureUpper, signatureMin, signatureMax) = extractSignature(botsAttributesTotalList, fpOutput)
        attributes = extractAttributes(domain, ccTldDict, tldDict, fpOutput)
        signatureMatchCount = compareSignature(attributes, signatureLower, signatureUpper, signatureMin, signatureMax, fpOutput)
        if signatureMatchCount < thresholdSignature:
            continue
        botDetected = 1
        if CnCDetected == 0:
            CnCDetected = 1
            hostLine = "Host %s is detected as bot based on linguistic feature 2: C&C domain is detected\n" % host.IP
            outputContentBot += hostLine
            fpOutput.write(hostLine)
        domainLine = "CnC domain: %s, IP: %s matches %d signature attributes\n" % (domain, domainIP, signatureMatchCount)
        outputContentBot += domainLine
        fpOutput.write(domainLine)
        CnCDomainList.append(domain)

    if CnCDetected == 1:
        outputContentBot += "\n"
        outputFileBot = outputFilePrefix + "-Bot-%s" % host.IP
        with open(outputFileBot, 'a') as fpOutputBot:
            fpOutputBot.write("%s\n" % outputContentBot)
        periodBegin = datetime.datetime.fromtimestamp(firstTimestamp - 60).strftime('%Y-%m-%d %H:%M:%S')
        periodEnd = datetime.datetime.fromtimestamp(lastTimestamp + 60).strftime('%Y-%m-%d %H:%M:%S')
        emailContents = "Host: %s is labeled as bot during %s and %s\n\nQueried NXDomains:\n" % (host.IP, periodBegin, periodEnd)
        emailContents += "".join(["%s\n" % domain for domain in botsNXDomainList])
        emailContents += "\nLabeled C&C domains:\n"
        emailContents += "".join(["%s\n" % domain for domain in CnCDomainList])
    if (botDetected == 1) and (enableEmail == 1):
        sendEmail(emailContents, receiver)
    botDetected = 0


def outlierNXDomain2LD(hostDict, fpOutput):
    """Return the outlier threshold for the number of suspicious NXDOMAIN 2LDs.

    The threshold is mean + 3 * standard deviation of the
    suspiciousNXDOMAIN2LD counters of all hosts in `hostDict`.  With fewer
    than 10 hosts 0 is returned, so every host counts as an outlier.
    """
    # if the number of hosts is very few, we consider all of them as outliers
    if len(hostDict) < 10:
        return 0
    if enableVerbose == 1:
        fpOutput.write("outlierNXDomain2LD\nhost number: %d\n" % len(hostDict))
    countsPerHost = list()
    for host in hostDict:
        suspicious2LD = hostDict[host].suspiciousNXDOMAIN2LD
        if enableVerbose == 1 and suspicious2LD > 0:
            fpOutput.write("%s %d\n" % (host, suspicious2LD))
        countsPerHost.append(suspicious2LD)
    average = numpy.mean(countsPerHost)
    stddev = numpy.std(countsPerHost)
    if enableVerbose == 1:
        fpOutput.write("mean: %f, std: %f, threshold: %f\n" % (average, stddev, average+3*stddev))
    return average+3*stddev


# remove the domains whose 2LD is registered, we consider such domains are legitimate
# this function is NOT used if sldExistenceFile is not specified
def removeLegitimateDomains(domainsList, ccTldDict, tldDict, dynamicDomainDict, dictionaryDict):
    """Drop every domain whose 2LD (or "www." + 2LD) is known to exist.

    Existence is looked up in the global existingSLDDict (value 1 means the
    name exists).  Names missing from that cache are resolved only when
    enable2LDProbe is 1, and the probe results are stored in the cache.
    2LDs listed in `dynamicDomainDict` are never removed.  `domainsList` is
    modified in place and also returned.  `dictionaryDict` is unused.
    """
    domainToDelList = list()
    for fullDomain in domainsList:
        (domain2LD, domain3LD, domain2LDs, domain3LDs) = extractLevelDomain(fullDomain, ccTldDict, tldDict)
        subdomain = domain2LDs
        if subdomain in dynamicDomainDict:
            continue
        wwwSubdomain = "www." + subdomain
        if (subdomain in existingSLDDict) or (wwwSubdomain in existingSLDDict):
            if existingSLDDict.get(subdomain) == 1 or existingSLDDict.get(wwwSubdomain) == 1:
                domainToDelList.append(fullDomain)
        elif enable2LDProbe == 1:
            existence1 = domainExistenceCheck(subdomain)
            existence2 = domainExistenceCheck(wwwSubdomain)
            if (existence1 == 1) or (existence2 == 1):
                domainToDelList.append(fullDomain)
            existingSLDDict[subdomain] = existence1
            existingSLDDict[wwwSubdomain] = existence2
    # The decision depends only on the domain string, so filtering by value
    # removes the same entries as removing them one by one.
    domainsToDelete = set(domainToDelList)
    domainsList[:] = [domain for domain in domainsList if domain not in domainsToDelete]
    return domainsList


# check whether an IP is in local networks
def inPrefix(IP, networkPrefixDict):
    """Return 1 if `IP` lies in one of the prefixes of `networkPrefixDict`, else 0."""
    for prefix in networkPrefixDict:
        if IPAddress(IP) in IPNetwork(prefix):
            return 1
    return 0


# check whether a domain exists
def domainExistenceCheck(domain):
    """Return 1 if `domain` resolves through the system resolver, otherwise 0."""
    try:
        socket.gethostbyname(domain)
    except socket.gaierror:
        return 0
    return 1


# this function is currently NOT used
def removeDynamicDomains(domainsList, ccTldDict, tldDict, dynamicDomainDict):
    """Remove the domains of every 2LD that occurs at least thresholdDynamic times.

    2LDs listed in `dynamicDomainDict` are left alone.  A domain is removed
    when the frequent 2LD is a substring of it.  `domainsList` is modified
    in place and also returned.
    """
    # thresholdDynamic is commented out in the settings at the top of the
    # file; fall back to the value given there (3) when it is not defined.
    threshold = globals().get("thresholdDynamic", 3)
    subdomainCount = dict()
    for fullDomain in domainsList:
        (domain2LD, domain3LD, domain2LDs, domain3LDs) = extractLevelDomain(fullDomain, ccTldDict, tldDict)
        subdomainCount[domain2LDs] = subdomainCount.get(domain2LDs, 0) + 1
    domainsToDelete = set()
    for subdomain in subdomainCount:
        if subdomain in dynamicDomainDict:
            continue
        if subdomainCount[subdomain] >= threshold:
            for domain in domainsList:
                if subdomain in domain:
                    domainsToDelete.add(domain)
    # A set is used so that a domain matched by two 2LDs is removed once
    # instead of raising ValueError on the second removal.
    domainsList[:] = [domain for domain in domainsList if domain not in domainsToDelete]
    return domainsList

# end of functions for bot detection


def hexify(x):
    "The strings from DNS resolver contain non-ASCII characters - I don't know why. This function investigates that"
    return "".join(["%02x" % ord(character) for character in x])


def _localHostOfResponse(srcIP, dstIP, dnsServerDict, networkPrefixDict):
    """Return the monitored host a DNS message belongs to, or None to skip it.

    The message must have a configured DNS server on one side and an address
    inside the local prefixes on the other side.
    """
    hostIP = None
    if srcIP in dnsServerDict:
        if inPrefix(dstIP, networkPrefixDict) == 0:
            return None
        hostIP = dstIP
    if dstIP in dnsServerDict:
        if inPrefix(srcIP, networkPrefixDict) == 0:
            return None
        hostIP = srcIP
    return hostIP


def _processResponse(ip, domain, qtype, rcode, timestamp, domainIP, fpOutput):
    """Account one DNS response for host `ip`; run detection when the window ends."""
    global timestampPeriodBegin
    if timestampPeriodBegin == 0:
        timestampPeriodBegin = timestamp
    if timestamp - timeInterval > timestampPeriodBegin:
        # the time window is over: analyse the collected hosts, then start fresh
        timeoutDetection(hostDict, fpOutput)
        cleanHostDict(hostDict, ccTldDict, tldDict, dynamicDomainDict)
        timestampPeriodBegin = timestampPeriodBegin + timeInterval
    if ip not in hostDict:
        newHost = Host()
        newHost.IP = ip
        newHost.startTime = timestamp
        newHost.endTime = timestamp
        hostDict[ip] = newHost
    updateResponseDomain(hostDict, domain, qtype, rcode, ip, timestamp, domainIP, fpOutput)


def singleDNSFileDetection(dnsServerDict, networkPrefixDict, inputFile, fpOutput):
    """Run the detection on one DNS log file.

    Every line is split on single spaces and needs at least 10 fields:
    [0] timestamp, [1] source IP, [3] destination IP, [6] response flag
    (only 1 is processed), [7] rcode, [8] query type, [9] domain and,
    when the line has exactly 11 fields, [10] the resolved IP.
    """
    global timestampPeriodBegin
    timestampPeriodBegin = 0
    with open(inputFile, 'r') as fpInput:
        sys.stdout.write("%s\n" % inputFile)
        for dnsServer in dnsServerDict:
            sys.stdout.write("DNS Server: %s\n" % dnsServer)
        for line in fpInput:
            info = line.strip("\n").split(" ")
            if len(info) < 10:
                continue
            if int(info[6]) != 1:
                continue
            timestamp = float(info[0])
            srcIP = info[1]
            dstIP = info[3]
            rcode = int(info[7])
            qtype = int(info[8])
            domain = info[9]
            # Reset per record: a response without an IP must not inherit
            # the address of the previous line.
            domainIP = info[10] if len(info) == 11 else ""
            ip = _localHostOfResponse(srcIP, dstIP, dnsServerDict, networkPrefixDict)
            if ip is None:
                continue
            _processResponse(ip, domain, qtype, rcode, timestamp, domainIP, fpOutput)


def singlePcapDetection(pc, fpOutput, dnsServerDict, networkPrefixDict):
    """Run the detection on an iterable of (timestamp, raw packet) pairs.

    Only DNS responses (port 53 over UDP or TCP, IPv4) exchanged between a
    configured DNS server and a local host are processed; packets that
    cannot be decoded are skipped.
    """
    global timestampPeriodBegin
    timestampPeriodBegin = 0
    for ts, pkt in pc:
        try:
            eth = dpkt.ethernet.Ethernet(pkt)
            if eth.type != dpkt.ethernet.ETH_TYPE_IP:
                continue
            ipPacket = eth.data
            src = ipPacket.src
            dst = ipPacket.dst
            if ipPacket.p != dpkt.ip.IP_PROTO_UDP and ipPacket.p != dpkt.ip.IP_PROTO_TCP:
                continue
            transport = ipPacket.data
            sport = transport.sport
            dport = transport.dport
            data = transport.data
        except Exception:
            continue
        if dport != 53 and sport != 53:
            continue
        # some packets have bad DNS data
        try:
            dns = dpkt.dns.DNS(data)
        except Exception:
            continue
        # DNS responses
        if dns.qr != dpkt.dns.DNS_R:
            continue
        # Start from an empty address for every packet and let only A
        # records set it, so a trailing non-A record cannot erase the
        # address and nothing leaks in from the previous packet.  The
        # section order (authority, answer, additional) is the original one.
        domainIP = ""
        for section in (dns.ns, dns.an, dns.ar):
            for rr in section:
                address = extractDomainIP(rr)
                if address:
                    domainIP = address
        for qname in dns.qd:
            src_ip = socket.inet_ntoa(src)
            dst_ip = socket.inet_ntoa(dst)
            hostIP = _localHostOfResponse(src_ip, dst_ip, dnsServerDict, networkPrefixDict)
            if hostIP is None:
                continue
            _processResponse(hostIP, qname.name, qname.type, dns.rcode, ts, domainIP, fpOutput)


def decodeDname(question, dname):
    """Decode the DNS name at the start of `dname`.

    question -- the complete message, used to follow compression pointers
    dname    -- string that starts with the length-prefixed labels

    Returns (lower-cased name with a trailing dot after every label,
    number of characters consumed from `dname`).  The label lengths and
    pointer offsets are printed to stdout as debugging output.
    """
    # handle compression
    offset = 0
    domain = ""
    while True:
        length = ord(dname[offset])
        sys.stdout.write("length:  %s\n" % length)
        if length > 63:  # compression?
            c_index = (length & 0x3f)*256 + ord(dname[offset+1])
            sys.stdout.write("c_index:  %s\n" % c_index)
            compressed, consumed = decodeDname(question, question[c_index:])
            return (domain + compressed).lower(), offset+2
        if length == 0:
            break
        domain += dname[offset+1:offset+1+length] + "."
        offset += length + 1
    return domain.lower(), offset+1


def extractDomainIP(rr):
    """Return the dotted IPv4 address of an A record, "" for anything else."""
    r_data = rr.rdata
    if rr.type != dpkt.dns.DNS_A:
        return ""
    try:
        return socket.inet_ntoa(r_data)
    except Exception:
        return ""


def cleanHostDict(hostDict, ccTldDict, tldDict, dynamicDomainDict):
    """Reset the per-window state of every host, then empty `hostDict`.

    The three dictionary parameters besides `hostDict` are unused.
    """
    for ip in hostDict:
        record = hostDict[ip]
        record.noErrorDict.clear()
        record.formatErrorDict.clear()
        record.serverFailDict.clear()
        record.NXDOMAINDict.clear()
        record.suspiciousNXDOMAINDict.clear()
        record.suspiciousNXDOMAIN2LDDict.clear()
        record.notImplementDict.clear()
        record.refusedDict.clear()
        record.suspiciousNXDOMAIN = 0
        record.suspiciousNXDOMAIN2LD = 0
        record.suspiciousNXDOMAINPeriodDict.clear()
        del record.suspiciousNXDOMAINList[:]
    hostDict.clear()


def timeoutDetection(hostDict, fpOutput):
    """Run botDetection on every unlabeled, non-excluded outlier host.

    A host is an outlier when its number of suspicious NXDOMAIN 2LDs
    reaches the threshold returned by outlierNXDomain2LD.
    """
    NXDomain2LDThreshold = outlierNXDomain2LD(hostDict, fpOutput)
    for ip in hostDict:
        suspicious2LDCount = len(hostDict[ip].suspiciousNXDOMAIN2LDDict.keys())
        if suspicious2LDCount < NXDomain2LDThreshold or hostDict[ip].labeled != 0:
            continue
        if ip in excludedHostsDict:
            continue
        if enableVerbose == 1:
            fpOutput.write("Host %s is detected as suspicious based on quantity feature: This host is an outlier in terms of the number of suspicious NXDOMAIN 2LD: %d suspicious NXDOMAIN 2LD\n" % (ip, suspicious2LDCount))
        botDetection(hostDict[ip], fpOutput)


def _count2LDPerMinute(sorted2LDList):
    """Count the entries of a timestamp-sorted (2LD, timestamp) list per bin.

    A new bin starts with the first entry that is 60 seconds or more after
    the entry that opened the current bin.  The list must not be empty.
    """
    counterList = list()
    count = 0
    binStart = sorted2LDList[0][1]
    for item in sorted2LDList:
        # record the number of suspicious 2LD every 60 seconds
        if item[1] < binStart + 60:
            count += 1
        else:
            counterList.append(count)
            count = 1
            binStart = item[1]
    counterList.append(count)
    return counterList


def increaseCUSUM(host, localMax, threshold, fpOutput):
    """Detect an increase of the suspicious 2LD rate of `host`.

    For every start bin the absolute deviations from `localMax` of up to 5
    consecutive bins are accumulated; a change is reported as soon as the
    sum exceeds `threshold`.

    Returns (index of the first bin above localMax or -1, index of the bin
    where the sum exceeded the threshold, first timestamp + 60 * first
    index), or (-1, -1, -1) when nothing is detected.
    """
    sorted2LDList = sorted(host.suspiciousNXDOMAIN2LDDict.items(), key=operator.itemgetter(1))
    if len(sorted2LDList) < 1:
        return (-1, -1, -1)
    counterList = _count2LDPerMinute(sorted2LDList)
    first = -1
    for start in range(0, len(counterList)):
        sumCurrent = 0
        # check whether the number of suspicious 2LD increases in 5 minutes
        for position in range(start, min(start + 5, len(counterList))):
            sumCurrent = sumCurrent + math.fabs(counterList[position] - localMax)
            if (counterList[position] > localMax) and (first == -1):
                first = position
            if sumCurrent > threshold:
                if enableVerbose:
                    fpOutput.write("increase change detected: offset %d-%d\n" % (first, position))
                return (first, position, sorted2LDList[0][1]+60*first)
    return (-1, -1, -1)


def decreaseCUSUM(host, localMax, threshold, increaseEnd, fpOutput):
    """Detect a decrease of the suspicious 2LD rate after bin `increaseEnd`.

    Starting behind `increaseEnd`, the absolute differences between
    neighbouring bins are accumulated over temporalWindow bins; a change is
    reported as soon as the sum exceeds `threshold`.  An empty bin is
    appended when the last 2LD was seen before the end of the current time
    window.  `localMax` is unused.

    Returns (start bin, bin where the sum exceeded the threshold, first
    timestamp + 60 * that bin), or (-1, -1, -1) when nothing is detected.
    """
    sorted2LDList = sorted(host.suspiciousNXDOMAIN2LDDict.items(), key=operator.itemgetter(1))
    if len(sorted2LDList) < 1:
        return (-1, -1, -1)
    counterList = _count2LDPerMinute(sorted2LDList)
    timestampLastDomain = sorted2LDList[-1][1]
    if timestampPeriodBegin + timeInterval > timestampLastDomain:
        counterList.append(0)
    for start in range(increaseEnd+1, len(counterList)):
        sumCurrent = 0
        for position in range(start, min(start + temporalWindow, len(counterList))):
            sumCurrent = sumCurrent + math.fabs(counterList[position] - counterList[position-1])
            if sumCurrent > threshold:
                if enableVerbose == 1:
                    fpOutput.write("decrease change detected: offset %d-%d\n" % (start, position))
                return (start, position, sorted2LDList[0][1]+60*position)
    return (-1, -1, -1)


def _buildEmailMessage(sender, recipients, subject, text):
    """Return the raw message: From/To/Subject headers, blank line, body."""
    return "From: %s\nTo: %s\nSubject: %s\n\n%s" % (sender, ", ".join(recipients), subject, text)


def sendEmail(emailContents, receiver):
    """Send `emailContents` to `receiver` through the SMTP server on localhost.

    Best effort: a failure is reported on stdout and never raised.
    """
    SERVER = "localhost"
    FROM = "admin@botdiggertest.com"
    TO = [receiver]  # must be a list
    SUBJECT = "BotDigger Notice"
    # The headers must start with "From:"; the former literal began with
    # "\F", which Python keeps as a backslash followed by "F".
    message = _buildEmailMessage(FROM, TO, SUBJECT, emailContents)
    try:
        server = smtplib.SMTP(SERVER)
        try:
            server.sendmail(FROM, TO, message)
        finally:
            server.quit()
        sys.stdout.write("Successfully sent email\n")
    except Exception:
        sys.stdout.write("Error: unable to send email\n")


def main(argv):
    """Parse the command line in `argv`, load all tables and run the detection.

    The detection runs for every input that is given: a live interface (-i),
    a pcap file (-f), a directory of pcap files (-F), a directory of DNS log
    files (-O) and a single DNS log file (-o).  Missing mandatory options
    end the program through parser.error().
    """
    global outputFilePrefix
    global receiver
    global enableEmail
    global enable2LDProbe
    global enableVerbose
    global timeInterval
    global thresholdSimilarity
    global thresholdBotsOneCluster
    # os.path is needed below; the imports visible at the top of the file
    # only bring in `walk` from os.
    import os

    dnsServerDict = dict()
    networkPrefixDict = dict()

    parser = optparse.OptionParser()
    parser.add_option("-i", "--interface", action="store", type="string", dest="interface", help="specify the network interface")
    parser.add_option("-f", "--inputpcap", action="store", type="string", dest="inputpcapfile", help="specify the input pcap file")
    parser.add_option("-F", "--inputpcapDir", action="store", type="string", dest="inputpcapDir", help="specify the input pcap directory")
    parser.add_option("-t", "--tld", action="store", type="string", dest="tldListFile", help="specify the file that contains TLDs")
    parser.add_option("-b", "--blwebsites", action="store", type="string", dest="blWebsitesFile", help="specify the file that contains websites providing blacklist service")
    parser.add_option("-c", "--configwords", action="store", type="string", dest="configWordsFile", help="specify the file that contains the words to ignore")
    parser.add_option("-s", "--dnsserver", action="store", type="string", dest="dnsServerFile", help="specify the file that contains IPs of local RDNS")
    parser.add_option("-p", "--populardomain", action="store", type="string", dest="popularDomainFile", help="specify the file that contains popular domains")
    parser.add_option("-P", "--prefixFile", action="store", type="string", dest="prefixFile", help="specify the file including local network prefixes (e.g., NetworkPrefixes)")
    parser.add_option("-d", "--dictionary", action="store", type="string", dest="dictionaryFile", help="specify the file that contains dictionary")
    parser.add_option("-o", "--offlinefile", action="store", type="string", dest="offlineDomainFile", help="specify the file that contains DNS information")
    parser.add_option("-O", "--offlinedirectory", action="store", type="string", dest="offlineDomainDirectory", help="specify the directory that contains DNS files")
    parser.add_option("-n", "--dynamicdomains", action="store", type="string", dest="dynamicDomainFile", help="specify the file that contains dynamic domains")
    parser.add_option("-e", "--enterprises", action="store", type="string", dest="bigEnterpriseFile", help="specify the file that contains big enterprises")
    parser.add_option("-x", "--excludedhosts", action="store", type="string", dest="excludedHostsFile", help="specify the file that contains hosts to exclude")
    parser.add_option("-D", "--excludeddomains", action="store", type="string", dest="excludedDomainsFile", help="specify the file that contains domains to exclude")
    parser.add_option("-r", "--resultsfile", action="store", type="string", dest="resultsFile", help="specify the output file or directory")
    parser.add_option("-R", "--receiver", action="store", type="string", dest="receiver", help="specify the email receiver")
    parser.add_option("-E", "--existingSLD", action="store", type="string", dest="sldExistenceFile", help="specify the file that contains existing SLDs")
    parser.add_option("-T", "--thresholdSimilarity", action="store", type="float", dest="thresholdSimilarity", help="specify the similarity threshold, default value is 0.1")
    parser.add_option("-B", "--thresholdBotsOneCluster", action="store", type="int", dest="thresholdBotsOneCluster", help="specify the bot cluster threshold, default value is 4")
    parser.add_option("-w", "--timeWindow", action="store", type="int", dest="timeInterval", help="specify the time window for bot detection, default value is 600 seconds")
    parser.add_option("-l", "--enable2LDProbe", action="store_true", dest="enable2LDProbe", default=False, help="enbale 2LD probe, this generates lots of DNS queries, recommand to disable this when running BotDigger in real time")
    parser.add_option("-v", "--enableVerbose", action="store_true", dest="enableVerbose", default=False, help="verbose mode, analysis information is given for debugging")
    # parse the arguments handed to main() instead of silently re-reading sys.argv
    (options, args) = parser.parse_args(argv)

    initialize_tables()

    # default is 0.1; "is not None" keeps an explicit 0 from being ignored
    if options.thresholdSimilarity is not None:
        thresholdSimilarity = options.thresholdSimilarity
    # default is 4
    if options.thresholdBotsOneCluster is not None:
        thresholdBotsOneCluster = options.thresholdBotsOneCluster
    # default is 10 minutes
    if options.timeInterval:
        timeInterval = options.timeInterval

    # mandatory tables, checked and loaded in the original order
    if options.prefixFile:
        loadNetworkPrefix(options.prefixFile, networkPrefixDict)
    else:
        parser.error("network prefix not given, use -P")
    if options.dnsServerFile:
        loadDNSServer(dnsServerDict, options.dnsServerFile)
    else:
        parser.error("DNS server file not given, use -s")
    if options.tldListFile:
        loadKnownTLD(tldDict, ccTldDict, nonCcTldDict, options.tldListFile)
    else:
        parser.error("TLD file not given, use -t")
    if options.blWebsitesFile:
        loadBLWebsites(blWebsitesDict, options.blWebsitesFile)
    else:
        parser.error("blWebsitesFile not given, use -b")
    if options.configWordsFile:
        loadConfigWords(configWordsDict, options.configWordsFile)
    else:
        parser.error("configWordsFile not given, use -c")
    if options.dynamicDomainFile:
        loadDynamicDomain(dynamicDomainDict, options.dynamicDomainFile)
    else:
        parser.error("dynamicDomainFile not given, use -n")
    if options.popularDomainFile:
        loadPopularDomain(popularDomainDict, options.popularDomainFile, tldDict, ccTldDict)
    else:
        parser.error("popularDomainFile not given, use -p")
    if options.bigEnterpriseFile:
        loadBigEnterprises(bigEnterpriseDict, options.bigEnterpriseFile, tldDict, ccTldDict)
    else:
        parser.error("bigEnterpriseFile not given, use -e")
    if options.dictionaryFile:
        loadDictionary(options.dictionaryFile, dictionaryDict)
    else:
        parser.error("dictionaryFile not given, use -d")
    if options.excludedHostsFile:
        loadExcludedHosts(options.excludedHostsFile, excludedHostsDict)
    else:
        parser.error("excludedHostsFile not given, use -x")
    if options.excludedDomainsFile:
        loadExludedDomains(excludedDomainsDict, options.excludedDomainsFile, tldDict, ccTldDict)
    else:
        parser.error("excludedDomainsFile not given, use -D")
    if options.sldExistenceFile:
        loadSLDExistence(options.sldExistenceFile, existingSLDDict)
    if options.resultsFile:
        resultsFile = options.resultsFile
    else:
        parser.error("output file/directory not given, use -r")

    if options.receiver:
        receiver = options.receiver
        enableEmail = 1
    if options.enable2LDProbe:
        enable2LDProbe = 1
    if options.enableVerbose:
        enableVerbose = 1

    # live detection on a network interface
    if options.interface:
        interface = options.interface
        # same prefix as in single-file mode, so the per-host bot files are
        # written next to the result file
        outputFilePrefix = resultsFile
        with open(resultsFile, 'w') as fpOutput:
            sys.stdout.write("Monitoring the DNS traffic on interface: %s\n" % interface)
            pc = pcap.pcap(interface)
            singlePcapDetection(pc, fpOutput, dnsServerDict, networkPrefixDict)
            timeoutDetection(hostDict, fpOutput)
            cleanHostDict(hostDict, ccTldDict, tldDict, dynamicDomainDict)

    #offline detection using pcap files
    if options.inputpcapfile:
        inputpcapfile = options.inputpcapfile
        outputFilePrefix = resultsFile
        with open(resultsFile, 'w') as fpOutput:
            with openFile(inputpcapfile, 'rb') as f:
                pc = dpkt.pcap.Reader(f)
                singlePcapDetection(pc, fpOutput, dnsServerDict, networkPrefixDict)
            sys.stdout.write("%s\n" % inputpcapfile)
            timeoutDetection(hostDict, fpOutput)
            cleanHostDict(hostDict, ccTldDict, tldDict, dynamicDomainDict)

    #offline detection using pcap files in a directory
    if options.inputpcapDir:
        for (dirpath, dirnames, filenames) in walk(options.inputpcapDir):
            filenames.sort()
            for filename in filenames:
                inputpcapfile = os.path.join(dirpath, filename)
                # same naming scheme as for a directory of DNS log files
                outputFilePrefix = resultsFile + filename
                outputFile = resultsFile + filename + "-BotResults"
                sys.stdout.write("%s\n" % inputpcapfile)
                sys.stdout.write("%s\n" % outputFile)
                with open(outputFile, 'w') as fpOutput:
                    with openFile(inputpcapfile, 'rb') as f:
                        pc = dpkt.pcap.Reader(f)
                        singlePcapDetection(pc, fpOutput, dnsServerDict, networkPrefixDict)
                    timeoutDetection(hostDict, fpOutput)
                    cleanHostDict(hostDict, ccTldDict, tldDict, dynamicDomainDict)

    #offline detection using DNS log files
    if options.offlineDomainDirectory:
        for (dirpath, dirnames, filenames) in walk(options.offlineDomainDirectory):
            filenames.sort()
            for filename in filenames:
                inputFile = os.path.join(dirpath, filename)
                outputFilePrefix = resultsFile + filename
                outputFile = resultsFile + filename + "-BotResults"
                with open(outputFile, 'w') as fpOutput:
                    singleDNSFileDetection(dnsServerDict, networkPrefixDict, inputFile, fpOutput)
                    timeoutDetection(hostDict, fpOutput)
                    cleanHostDict(hostDict, ccTldDict, tldDict, dynamicDomainDict)

    if options.offlineDomainFile:
        outputFilePrefix = resultsFile
        with open(resultsFile, 'w') as fpOutput:
            singleDNSFileDetection(dnsServerDict, networkPrefixDict, options.offlineDomainFile, fpOutput)
            timeoutDetection(hostDict, fpOutput)
            cleanHostDict(hostDict, ccTldDict, tldDict, dynamicDomainDict)


if __name__ == "__main__":
    main(sys.argv[1:])