#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import os
import json
import shutil
import sys
import datetime
import csv
import math
from collections import OrderedDict
from utils import Util
from components.data.data import Data
from components.iana.iana_transform import IanaTransform
from components.nc.network_context import NetworkContext
import api.resources.hdfs_client as HDFSClient
import api.resources.impala_engine as impala
from multiprocessing import Process
import pandas as pd
from impala.util import as_pandas
import time
import md5


def _read_json(path, **kwargs):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file handle is closed before returning.  Extra keyword arguments
    are forwarded unchanged to ``json.loads`` (e.g. ``object_pairs_hook``).
    """
    with open(path) as json_file:
        return json.loads(json_file.read(), **kwargs)


class OA(object):
    """Proxy OA job for a single day.

    ``start()`` runs the steps in order: prepare folders, clear the output
    of previous runs, copy the notebooks, fetch the ML results from HDFS,
    enrich every scored row (reputation, IANA, network context), load the
    scores into Impala, spawn the suspicious-details loader and finally
    build the ingest summary.
    """

    def __init__(self, date, limit=500, logger=None):
        """Create the job.

        :param date: day to process; sliced as ``[:4]``, ``[4:6]`` and
            ``[6:]`` for year, month and day (i.e. a ``YYYYMMDD`` string).
        :param limit: maximum number of ML results to read.
        :param logger: when truthy, the ``'OA.PROXY'`` logger from
            ``logging`` is used; otherwise one is created through
            ``Util.get_logger``.
        """
        self._initialize_members(date, limit, logger)

    def _initialize_members(self, date, limit, logger):
        """Set every instance attribute and load the configuration files."""
        # get logger if exists. if not, create new instance.
        if logger:
            self._logger = logging.getLogger('OA.PROXY')
        else:
            self._logger = Util.get_logger('OA.PROXY', create_file=False)

        # initialize required parameters.
        # NOTE: the attribute name keeps its historical spelling
        # ("scrtip") so that any external reader of it keeps working.
        self._scrtip_path = os.path.dirname(os.path.abspath(__file__))
        self._date = date
        self._table_name = "proxy"
        self._proxy_results = []
        self._limit = limit
        self._data_path = None
        self._ipynb_path = None
        self._ingest_summary_path = None
        self._proxy_scores = []
        self._proxy_scores_headers = []
        self._proxy_extra_columns = []
        self._results_delimiter = '\t'

        # get app configuration.
        self._spot_conf = Util.get_spot_conf()

        # get scores fields conf (OrderedDict keeps the key order of the file).
        conf_file = "{0}/proxy_conf.json".format(self._scrtip_path)
        self._conf = _read_json(conf_file, object_pairs_hook=OrderedDict)

        # initialize data engine
        self._db = self._conf_value('DBNAME')

    def _conf_value(self, option):
        """Return ``option`` from the ``conf`` section with quotes removed."""
        raw_value = self._spot_conf.get('conf', option)
        return raw_value.replace("'", "").replace('"', '')

    @staticmethod
    def _oa_root():
        """Return the parent of the directory that contains this file."""
        return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    def start(self):
        """Run the whole pipeline and print the elapsed wall-clock seconds."""
        started_at = time.time()

        self._create_folder_structure()
        self._clear_previous_executions()
        self._add_ipynb()
        self._get_proxy_results()
        self._add_reputation()
        self._add_iana()
        self._add_network_context()
        self._create_proxy_scores_csv()
        self._get_oa_details()
        self._ingest_summary()

        finished_at = time.time()
        print(finished_at - started_at)

    def _create_folder_structure(self):
        """Create the date folder structure if it does not exist."""
        self._logger.info("Creating folder structure for OA (data and ipynb)")
        folders = Util.create_oa_folders("proxy", self._date)
        self._data_path, self._ingest_summary_path, self._ipynb_path = folders

    def _clear_previous_executions(self):
        """Delete what a previous run for the same day left in HDFS."""
        self._logger.info("Cleaning data from previous executions for the day")
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]
        HUSER = self._conf_value('HUSER')
        table_schema = ['suspicious', 'edge', 'threat_investigation',
                        'timeline', 'storyboard', 'summary']

        # month and day are cast to int here, so these partition folders
        # are addressed without zero padding.
        for path in table_schema:
            HDFSClient.delete_folder(
                "{0}/{1}/hive/oa/{2}/y={3}/m={4}/d={5}".format(
                    HUSER, self._table_name, path, yr, int(mn), int(dy)),
                user="impala")
        impala.execute_query("invalidate metadata")

        # removes Feedback file
        HDFSClient.delete_folder(
            "{0}/{1}/scored_results/{2}{3}{4}/feedback/ml_feedback.csv".format(
                HUSER, self._table_name, yr, mn, dy))
        # removes json files from the storyboard
        HDFSClient.delete_folder(
            "{0}/{1}/oa/{2}/{3}/{4}/{5}".format(
                HUSER, self._table_name, "storyboard", yr, mn, dy))

    def _add_ipynb(self):
        """Copy the notebook templates into the ipynb folder of the day."""
        if os.path.isdir(self._ipynb_path):
            self._logger.info("Adding advanced mode IPython Notebook")
            shutil.copy(
                "{0}/ipynb_templates/Advanced_Mode_master.ipynb".format(self._scrtip_path),
                "{0}/Advanced_Mode.ipynb".format(self._ipynb_path))

            self._logger.info("Adding threat investigation IPython Notebook")
            shutil.copy(
                "{0}/ipynb_templates/Threat_Investigation_master.ipynb".format(self._scrtip_path),
                "{0}/Threat_Investigation.ipynb".format(self._ipynb_path))
        else:
            self._logger.error("There was a problem adding the IPython Notebooks, please check the directory exists.")

    def _get_proxy_results(self):
        """Fetch the ML results file from HDFS and read up to ``limit`` rows.

        Exits the process with status 1 when the file could not be fetched
        or when it contains no results.
        """
        self._logger.info("Getting {0} Machine Learning Results from HDFS".format(self._date))
        proxy_results = "{0}/proxy_results.csv".format(self._data_path)

        # get hdfs path from conf file.
        HUSER = self._conf_value('HUSER')
        hdfs_path = "{0}/proxy/scored_results/{1}/scores/proxy_results.csv".format(HUSER, self._date)

        # get results file from hdfs.
        get_command = Util.get_ml_results_form_hdfs(hdfs_path, self._data_path)
        self._logger.info("{0}".format(get_command))

        # validate the file exists
        if os.path.isfile(proxy_results):
            # read number of results based in the limit specified.
            self._logger.info("Reading {0} proxy results file: {1}".format(self._date, proxy_results))
            self._proxy_results = Util.read_results(proxy_results, self._limit, self._results_delimiter)[:]
            if len(self._proxy_results) == 0:
                self._logger.error("There are not proxy results.")
                sys.exit(1)
        else:
            self._logger.error("There was an error getting ML results from HDFS")
            sys.exit(1)

        # work on a copy so the raw results stay untouched by the enrichment.
        self._proxy_scores = self._proxy_results[:]

    def _create_proxy_scores_csv(self):
        """Insert every enriched score row into ``<db>.proxy_scores``."""
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        # one "(v1, v2, ...)" group per row, comma separated.
        value_string = ",".join(
            str(tuple(Util.cast_val(item) for item in row))
            for row in self._proxy_scores)

        load_into_impala = (
            " INSERT INTO {0}.proxy_scores partition(y={2}, m={3}, d={4})"
            " VALUES {1} "
        ).format(self._db, value_string, yr, mn, dy)
        impala.execute_query(load_into_impala)

    def _add_reputation(self):
        """Append one reputation column per configured index to each score.

        When no reputation service is configured a single empty column is
        appended instead.
        """
        # read configuration.
        reputation_conf_file = "{0}/components/reputation/reputation_config.json".format(self._oa_root())
        self._logger.info("Reading reputation configuration file: {0}".format(reputation_conf_file))
        rep_conf = _read_json(reputation_conf_file)

        # initialize reputation services.
        self._rep_services = []
        self._logger.info("Initializing reputation services.")
        for service in rep_conf:
            config = rep_conf[service]
            module = __import__("components.reputation.{0}.{0}".format(service), fromlist=['Reputation'])
            self._rep_services.append(module.Reputation(config, self._logger))

        # get columns for reputation: the distinct values found at each
        # configured column index.
        indexes = [int(value) for value in self._conf["add_reputation"].values()]
        self._logger.info("Getting columns to add reputation based on config file: proxy_conf.json")
        rep_cols = {}
        for index in indexes:
            rep_cols[index] = list(set(conn[index] for conn in self._proxy_scores))

        # get reputation per column.
        self._logger.info("Getting reputation for each service in config")
        if self._rep_services:
            for key, value in rep_cols.items():
                rep_services_results = [
                    rep_service.check(None, value, True)
                    for rep_service in self._rep_services
                ]

                # merge the per-service answers as "a::b"; strip() removes
                # any leading/trailing ':' left by an empty answer.
                rep_results = {}
                for result in rep_services_results:
                    rep_results = {
                        k: "{0}::{1}".format(rep_results.get(k, ""), result.get(k, "")).strip('::')
                        for k in set(rep_results) | set(result)
                    }

                if rep_results:
                    self._proxy_scores = [conn + [rep_results[conn[key]]] for conn in self._proxy_scores]
                else:
                    self._proxy_scores = [conn + [""] for conn in self._proxy_scores]
        else:
            self._proxy_scores = [conn + [""] for conn in self._proxy_scores]

    def _add_iana(self):
        """Append the IANA name of the response code to each score.

        An empty column is appended when ``iana_config.json`` is missing.
        """
        iana_conf_file = "{0}/components/iana/iana_config.json".format(self._oa_root())
        if os.path.isfile(iana_conf_file):
            iana_config = _read_json(iana_conf_file)
            proxy_iana = IanaTransform(iana_config["IANA"])
            proxy_rcode_index = self._conf["proxy_score_fields"]["respcode"]
            self._proxy_scores = [
                conn + [proxy_iana.get_name(conn[proxy_rcode_index], "proxy_http_rcode")]
                for conn in self._proxy_scores
            ]
        else:
            self._proxy_scores = [conn + [""] for conn in self._proxy_scores]

    def _add_network_context(self):
        """Append the network context of the client IP to each score.

        An empty column is appended when ``nc_config.json`` is missing.
        """
        nc_conf_file = "{0}/components/nc/nc_config.json".format(self._oa_root())
        if os.path.isfile(nc_conf_file):
            nc_conf = _read_json(nc_conf_file)["NC"]
            proxy_nc = NetworkContext(nc_conf, self._logger)
            ip_dst_index = self._conf["proxy_score_fields"]["clientip"]
            self._proxy_scores = [
                conn + [proxy_nc.get_nc(conn[ip_dst_index])]
                for conn in self._proxy_scores
            ]
        else:
            self._proxy_scores = [conn + [""] for conn in self._proxy_scores]

    def _get_oa_details(self):
        """Start the suspicious-details loader in a separate process.

        The process is started and not joined here, so the caller carries
        on while the details are being loaded.
        """
        self._logger.info("Getting OA Proxy suspicious details")
        # start suspicious connects details process.
        p_sp = Process(target=self._get_suspicious_details)
        p_sp.start()

    def _get_suspicious_details(self):
        """Load the edge details of every score with a ``Y-M-D`` date."""
        # Stays None when the IANA configuration is missing;
        # _get_proxy_details then adds an empty IANA column instead.
        proxy_iana = None
        iana_conf_file = "{0}/components/iana/iana_config.json".format(self._oa_root())
        if os.path.isfile(iana_conf_file):
            iana_config = _read_json(iana_conf_file)
            proxy_iana = IanaTransform(iana_config["IANA"])

        fields = self._conf["proxy_score_fields"]
        for conn in self._proxy_scores:
            clientip = conn[fields["clientip"]]
            fulluri = conn[fields["fulluri"]]
            date = conn[fields["p_date"]].split('-')
            # rows whose date does not split into three parts are skipped.
            if len(date) == 3:
                year = date[0]
                month = date[1].zfill(2)
                day = date[2].zfill(2)
                hh = (conn[fields["p_time"]].split(":"))[0]
                self._get_proxy_details(fulluri, clientip, year, month, day, hh, proxy_iana)

    def _get_proxy_details(self, fulluri, clientip, year, month, day, hh, proxy_iana):
        """Copy the raw records of one URI/client/hour into ``proxy_edge``.

        :param fulluri: URI to look up; single quotes are backslash-escaped
            before being placed in the query.
        :param clientip: client IP to look up.
        :param year: partition year.
        :param month: partition month.
        :param day: partition day.
        :param hh: hour partition.
        :param proxy_iana: IANA translator, or a falsy value to add an
            empty IANA column instead.
        """
        limit = 250

        # NOTE(review): the query is built by string formatting and only
        # single quotes in fulluri are escaped - worth confirming this is
        # sufficient for the values that reach this point.
        query_to_load = (
            " SELECT p_date, p_time, clientip, host, webcat, respcode, reqmethod,"
            " useragent, resconttype, referer, uriport, serverip, scbytes, csbytes,"
            " fulluri, {5} as hh"
            " FROM {0}.{1}"
            " WHERE y='{2}' AND m='{3}' AND d='{4}' AND h='{5}'"
            " AND fulluri='{6}' AND clientip='{7}'"
            " LIMIT {8}; "
        ).format(self._db, self._table_name, year, month, day, hh,
                 fulluri.replace("'", "\\'"), clientip, limit)

        detail_results = impala.execute_query(query_to_load)

        if proxy_iana:
            # add IANA to results (index 5 is respcode in the SELECT list).
            self._logger.info("Adding IANA translation to details results")
            updated_rows = [
                conn + (proxy_iana.get_name(conn[5], "proxy_http_rcode"),)
                for conn in detail_results
            ]
            updated_rows = [row for row in updated_rows if row]
        else:
            # one-element tuple: a bare ("") is a str and cannot be
            # concatenated to the row tuple.
            updated_rows = [conn + ("",) for conn in detail_results]

        value_string = ",".join(str(tuple(row)) for row in updated_rows)

        if value_string != "":
            query_to_insert = (
                " INSERT INTO {0}.proxy_edge PARTITION (y={1}, m={2}, d={3})"
                " VALUES ({4}); "
            ).format(self._db, year, month, day, value_string)

            impala.execute_query(query_to_insert)

    def _ingest_summary(self):
        """Aggregate the day's record counts per minute and store them."""
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        self._logger.info("Getting ingest summary data for the day")

        ingest_summary_cols = ["date", "total"]
        df_filtered = pd.DataFrame()

        # get ingest summary.
        query_to_load = (
            " SELECT p_date, p_time, COUNT(*) as total"
            " FROM {0}.{1}"
            " WHERE y='{2}' AND m='{3}' AND d='{4}'"
            " AND p_date IS NOT NULL AND p_time IS NOT NULL"
            " AND clientip IS NOT NULL AND p_time != ''"
            " AND host IS NOT NULL AND fulluri IS NOT NULL"
            " GROUP BY p_date, p_time; "
        ).format(self._db, self._table_name, yr, mn, dy)

        results = impala.execute_query(query_to_load)

        if results:
            df_results = as_pandas(results)

            # Forms a new dataframe keeping only "date HH:MM" from the
            # date and time columns.
            minute_rows = []
            for _, val in df_results.iterrows():
                time_parts = val['p_time'].split(":")
                minute_label = "{0} {1}:{2}".format(
                    val['p_date'], time_parts[0].zfill(2), time_parts[1].zfill(2))
                total = 0 if math.isnan(val['total']) else int(val['total'])
                minute_rows.append([minute_label, total])
            df_new = pd.DataFrame(minute_rows, columns=ingest_summary_cols)

            # Groups the data by minute
            sf = df_new.groupby(by=['date'])['total'].sum()
            df_per_min = pd.DataFrame({'date': sf.index, 'total': sf.values})

            df_final = df_filtered.append(df_per_min, ignore_index=True).to_records(False, False)

            # len() is used on purpose: df_final is a record array, whose
            # truth value is ambiguous when it holds more than one row.
            if len(df_final) > 0:
                query_to_insert = (
                    " INSERT INTO {0}.proxy_ingest_summary"
                    " PARTITION (y={1}, m={2}, d={3}) VALUES {4}; "
                ).format(self._db, yr, mn, dy, tuple(df_final))

                impala.execute_query(query_to_insert)
        else:
            self._logger.info("No data found for the ingest summary")