#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import time
import logging
import os
import json

from multiprocessing import Pool

from common.utils import Util
from common import hdfs_client as hdfs
from common.hdfs_client import HdfsException
from common.file_collector import FileWatcher


class Collector(object):
    """Watches a local directory for pcap files, splits them with editcap,
    stages the pieces in HDFS, and notifies workers through a Kafka topic."""

    def __init__(self, hdfs_app_path, kafkaproducer, conf_type):
        self._initialize_members(hdfs_app_path, kafkaproducer, conf_type)

    def _initialize_members(self, hdfs_app_path, kafkaproducer, conf_type):
        # get parameters.
        self._logger = logging.getLogger('SPOT.INGEST.DNS')
        self._hdfs_app_path = hdfs_app_path
        self._producer = kafkaproducer

        # get script path.
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read dns configuration.
        conf_file = "{0}/ingest_conf.json".format(os.path.dirname(os.path.dirname(self._script_path)))
        with open(conf_file) as conf_fh:
            conf = json.loads(conf_fh.read())
        self._conf = conf["pipelines"][conf_type]

        # set destination configuration.
        self._collector_path = self._conf['collector_path']
        self._dsource = 'dns'
        self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)

        # set pcap split configuration.
        self._pkt_num = self._conf['pkt_num']
        self._pcap_split_staging = self._conf['pcap_split_staging']
        self._supported_files = self._conf['supported_files']

        # create collector watcher.
        self._watcher = FileWatcher(self._collector_path, self._supported_files)

        # multiprocessing.
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)
        # TODO: review re-use of hdfs.client
        self._hdfs_client = hdfs.get_client()

    def start(self):
        self._logger.info("Starting DNS ingest")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping DNS collector...")
            Util.remove_kafka_topic(self._producer.Zookeeper, self._producer.Topic, self._logger)
            self._watcher.stop()
            # terminate() stops the workers immediately; close() is a no-op
            # after terminate(), so only join() is needed afterwards.
            self._pool.terminate()
            self._pool.join()
            raise SystemExit("Ingest finished...")

    def _ingest_files_pool(self):
        if self._watcher.HasFiles:
            for x in range(0, self._processes):
                self._logger.info('processes: {0}'.format(self._processes))
                new_file = self._watcher.GetNextFile()
                if self._processes <= 1:
                    _ingest_file(
                        self._hdfs_client,
                        new_file,
                        self._pkt_num,
                        self._pcap_split_staging,
                        self._hdfs_root_path,
                        self._producer,
                        self._producer.Topic
                    )
                else:
                    result = self._pool.apply_async(_ingest_file, args=(
                        self._hdfs_client,
                        new_file,
                        self._pkt_num,
                        self._pcap_split_staging,
                        self._hdfs_root_path,
                        self._producer,
                        self._producer.Topic
                    ))
                    # result.get()  # to debug, add try/except around this call.
                if not self._watcher.HasFiles:
                    break
        return True


def _ingest_file(hdfs_client, new_file, pkt_num, pcap_split_staging, hdfs_root_path, producer, topic):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # get file name and date.
        org_file = new_file
        file_name_parts = new_file.split('/')
        file_name = file_name_parts[-1]

        # split the pcap into chunks of pkt_num packets.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(pkt_num, new_file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

    except Exception as err:
        logger.error("There was a problem splitting the file: {0}".format(err))

    for currdir, subdirs, files in os.walk(pcap_split_staging):
        for file in files:
            if file.endswith(".pcap") and "{0}_spot".format(name) in file:
                # get timestamp from the file name to build the hdfs path.
                file_date = file.split('.')[0]
                pcap_hour = file_date[-6:-4]
                pcap_date_path = file_date[-14:-6]

                # hdfs path with timestamp.
                hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, pcap_date_path, pcap_hour)

                # create hdfs path.
                try:
                    if len(hdfs.list_dir(hdfs_path, hdfs_client)) == 0:
                        logger.info('creating directory: ' + hdfs_path)
                        hdfs_client.mkdir(hdfs_path, hdfs_client)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    result = hdfs_client.upload_file(hadoop_pcap_file, os.path.join(currdir, file))
                    if not result:
                        logger.error('File failed to upload: ' + hadoop_pcap_file)
                        raise HdfsException

                    # create event for workers to process the file.
                    logger.info("Sending split file to Topic: {0}".format(topic))
                    producer.SendMessage(hadoop_pcap_file, topic)
                    logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(file, topic))

                except HdfsException as err:
                    logger.error('Exception: {0}'.format(err))
                    logger.info('Check HDFS connection settings and server health')

                except Exception as err:
                    logger.info("File {0} failed to be sent to Kafka Topic: {1}".format(new_file, topic))
                    logger.error("Error: {0}".format(err))
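

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module; the names below are
# assumptions). The collector is normally driven by the ingest master
# script, which reads ingest_conf.json, builds a Kafka producer wrapper
# (an object exposing the Topic and Zookeeper attributes plus the
# SendMessage(message, topic) method used above), and then does roughly:
#
#     producer = SomeKafkaProducerWrapper(...)   # hypothetical wrapper
#     collector = Collector('/user/spot', producer, 'dns')
#     collector.start()                          # blocks until Ctrl+C
# ---------------------------------------------------------------------------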