#! /usr/bin/env python
"""Winnow a harvested indicator list and enrich the surviving entries.

Reads a JSON list of indicators, drops reserved/private IPv4 addresses and
anything that is not a well-formed IPv4 address or FQDN, and writes two JSON
files: the filtered indicators and an enriched version carrying ASN, country
and (optionally) DNSDB-derived host names.
"""
import ConfigParser
import csv
import datetime as dt
import json
import re
import sys

import pygeoip
from netaddr import IPAddress, IPRange, IPSet
from sortedcontainers import SortedDict

import dnsdb_query
from logger import get_logger

logger = get_logger('winnower')

# from http://en.wikipedia.org/wiki/Reserved_IP_addresses:
reserved_ranges = IPSet(['0.0.0.0/8', '100.64.0.0/10', '127.0.0.0/8', '192.88.99.0/24',
                         '198.18.0.0/15', '198.51.100.0/24', '203.0.113.0/24', '233.252.0.0/24'])

# Maps the integer start of each address range to (IPRange, org string).
# Keys are integers so that the sorted order is numeric, which is what the
# bisect in org_by_addr() relies on.
gi_org = SortedDict()
geo_data = pygeoip.GeoIP('data/GeoIP.dat', pygeoip.MEMORY_CACHE)

# Compiled once; the pattern text is unchanged, only written as raw strings.
_IPV4_RE = re.compile(
    r'(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$')
_FQDN_RE = re.compile(
    r'(?=^.{4,255}$)(^((?!-)[a-zA-Z0-9-]{1,63}(?<!-)\.)+[a-zA-Z]{2,63}$)')


def load_gi_org(filename):
    """Load the range-to-organisation table from a CSV file into ``gi_org``.

    Each row is read as three columns: ``start``, ``end`` and ``org``.  The
    ``start`` column must be a decimal integer (org_by_addr() looks entries
    up by the integer value of an address).

    Returns the module-level ``gi_org`` mapping, which is updated in place.
    """
    with open(filename, 'rb') as f:
        org_reader = csv.DictReader(f, fieldnames=['start', 'end', 'org'])
        for row in org_reader:
            # Integer key: string keys would sort lexicographically
            # ("169000000" < "17000000") and break the bisect lookup.
            gi_org[int(row['start'])] = (IPRange(row['start'], row['end']),
                                         unicode(row['org'], errors='replace'))

    return gi_org


def org_by_addr(address):
    """Return ``(as_num, as_name)`` for *address*, or ``(None, None)``.

    *address* must support ``int()`` and membership tests against an
    ``IPRange`` (callers in this module pass ``netaddr.IPAddress``).  The
    candidate range is the one with the greatest start not above the
    address; the result is only filled in when that range really contains
    the address.
    """
    as_num = None
    as_name = None
    gi_index = gi_org.bisect(int(address))
    if gi_index == 0:
        # Table is empty or the address lies below the first range start;
        # without this guard index -1 would wrap around to the last entry.
        return as_num, as_name
    gi_net = gi_org[gi_org.iloc[gi_index - 1]]
    if address in gi_net[0]:
        as_num, _sep, as_name = gi_net[1].partition(' ')
        as_num = as_num.replace("AS", "")  # Making sure the variable only has the number
    return as_num, as_name


def maxhits(dns_records):
    """Return the ``rrname`` (trailing dot stripped) of the record with the highest count.

    Returns ``None`` when there are no records or no record has a positive
    ``count``.  On ties the first record seen wins.
    """
    hmax = 0
    hostname = None
    for record in dns_records:
        if record['count'] > hmax:
            hmax = record['count']
            hostname = record['rrname'].rstrip('.')
    return hostname


def maxhits_rdata(dns_records):
    """Return the first ``rdata`` entry (trailing dot stripped) of the record with the highest count.

    Returns ``None`` when there are no records or no record has a positive
    ``count``.  On ties the first record seen wins.
    """
    hmax = 0
    hostname = None
    for record in dns_records:
        if record['count'] > hmax:
            hmax = record['count']
            hostname = record['rdata'][0].rstrip('.')
    return hostname


def enrich_IPv4(address, dnsdb=None, hostname=None):
    """Return ``(as_num, as_name, country, hostname, rhost)`` for an IPv4 address.

    *address* is a ``netaddr.IPAddress``.  When a *dnsdb* client is given,
    ``rhost`` is the most frequently seen answer for the address's reverse
    DNS name; otherwise it is ``None``.  *hostname* is passed through
    unchanged.
    """
    as_num, as_name = org_by_addr(address)
    country = geo_data.country_code_by_addr('%s' % address)
    if dnsdb:
        inaddr = address.reverse_dns
        rhost = maxhits_rdata(dnsdb.query_rrset('%s' % inaddr))
    else:
        rhost = None
    return (as_num, as_name, country, hostname, rhost)


def enrich_FQDN(address, date, dnsdb):
    """Resolve *address* through DNSDB and enrich each resulting IPv4 address.

    A records are restricted to the day before *date* (``YYYY-MM-DD``).
    Only the ``rdata`` of the first remaining record is used.

    Returns a list of ``(ip, as_num, as_name, country, hostname, rhost)``
    tuples, or ``None`` when no record survives the date filter.
    """
    records = dnsdb.query_rrset(address, rrtype='A')
    yesterday = dt.datetime.strptime(date, '%Y-%m-%d') - dt.timedelta(days=1)
    yesterday_str = yesterday.strftime('%Y-%m-%d')
    records = filter_date(records, yesterday_str)
    if not records:
        return None

    enrichment = []
    for ip_addr in records[0]['rdata']:
        ip_addr_data = enrich_IPv4(IPAddress(ip_addr), dnsdb, address)
        enrichment.append((ip_addr,) + ip_addr_data)
    return enrichment


def filter_date(records, date):
    """Filter *records* to the single day *date* (``YYYY-MM-DD``).

    Delegates to ``dnsdb_query.filter_after`` / ``filter_before`` with the
    first and last instant of that day.
    """
    date_dt = dt.datetime.strptime(date, '%Y-%m-%d')
    start_dt = dt.datetime.combine(date_dt, dt.time.min).strftime('%Y-%m-%d %H:%M:%S')
    end_dt = dt.datetime.combine(date_dt, dt.time.max).strftime('%Y-%m-%d %H:%M:%S')
    return dnsdb_query.filter_before(dnsdb_query.filter_after(records, start_dt), end_dt)


def reserved(address):
    """Return True if *address* is reserved, private, or in ``reserved_ranges``."""
    a_reserved = address.is_reserved()
    a_private = address.is_private()
    a_inr = address in reserved_ranges
    return bool(a_reserved or a_private or a_inr)


def is_ipv4(address):
    """Return True if *address* is a dotted-quad IPv4 string."""
    return bool(_IPV4_RE.match(address))


def is_fqdn(address):
    """Return True if *address* looks like a fully qualified domain name."""
    return bool(_FQDN_RE.match(address))


def _config_flag(config, option, label):
    """Read a boolean option from the ``Winnower`` section and log its value.

    Only the literal values ``'1'`` and ``'True'`` count as enabled.
    """
    enabled = config.get('Winnower', option) in ('1', 'True')
    logger.info('Enriching %s indicators: %s' % (label, 'TRUE' if enabled else 'FALSE'))
    return enabled


def winnow(in_file, out_file, enr_file):
    """Filter the indicators in *in_file* and write plain and enriched results.

    *in_file* is a JSON list of 6-item entries
    ``(addr, addr_type, direction, source, note, date)``.  Valid entries are
    written to *out_file*; enriched tuples are written to *enr_file*.
    Settings come from ``combine.cfg`` in the current directory; if it
    cannot be read an error is logged and the function returns without
    writing anything.
    """
    config = ConfigParser.SafeConfigParser(allow_no_value=True)
    cfg_success = config.read('combine.cfg')
    if not cfg_success:
        logger.error('Winnower: Could not read combine.cfg.')
        logger.error('HINT: edit combine-example.cfg and save as combine.cfg.')
        return

    server = config.get('Winnower', 'dnsdb_server')
    api = config.get('Winnower', 'dnsdb_api')
    enrich_ip = _config_flag(config, 'enrich_ip', 'IPv4')
    enrich_dns = _config_flag(config, 'enrich_dns', 'DNS')

    logger.info('Setting up DNSDB client')

    # handle the case where we aren't using DNSDB
    dnsdb = dnsdb_query.DnsdbClient(server, api)
    if api == 'YOUR_API_KEY_HERE' or len(dnsdb.query_rdata_name('google.com')) == 0:
        dnsdb = None
        logger.info('Invalid DNSDB configuration found')

    with open(in_file, 'rb') as f:
        crop = json.load(f)

    # TODO: make these locations configurable?
    logger.info('Loading GeoIP data')
    load_gi_org('data/GeoIPASNum2.csv')  # fills the module-level gi_org table

    wheat = []
    enriched = []

    logger.info('Beginning winnowing process')
    for each in crop:
        (addr, addr_type, direction, source, note, date) = each
        # this should be refactored into appropriate functions
        if addr_type == 'IPv4' and is_ipv4(addr):
            ipaddr = IPAddress(addr)
            if not reserved(ipaddr):
                wheat.append(each)
                # Reverse-DNS enrichment only happens when enrich_ip is on.
                ip_data = enrich_IPv4(ipaddr, dnsdb if enrich_ip else None)
                enriched.append((addr, addr_type, direction, source, note, date) + ip_data)
            else:
                logger.error('Found invalid address: %s from: %s' % (addr, source))
        elif addr_type == 'FQDN' and is_fqdn(addr):
            wheat.append(each)
            if enrich_dns and dnsdb:
                e_data = enrich_FQDN(addr, date, dnsdb)
                if e_data:
                    # Distinct name: must not rebind the outer loop variable.
                    for resolved in e_data:
                        datum = (resolved[0], "IPv4", direction, source, note, date) + resolved[1:]
                        enriched.append(datum)
        else:
            logger.error('Could not determine address type for %s listed as %s' % (addr, addr_type))

    logger.info('Dumping results')
    with open(out_file, 'wb') as f:
        w_data = json.dumps(wheat, indent=2, ensure_ascii=False).encode('utf8')
        f.write(w_data)

    with open(enr_file, 'wb') as f:
        e_data = json.dumps(enriched, indent=2, ensure_ascii=False).encode('utf8')
        f.write(e_data)


if __name__ == "__main__":
    winnow('crop.json', 'crop.json', 'enriched.json')