#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import time
import logging
import os
import json

from multiprocessing import Pool

from common.utils import Util
from common import hdfs_client as hdfs
from common.hdfs_client import HdfsException
from common.file_collector import FileWatcher


class Collector(object):
    """Watches a local directory for pcap files, splits them with editcap,
    stages the pieces in HDFS, and notifies workers through a Kafka topic."""

    def __init__(self, hdfs_app_path, kafkaproducer, conf_type):
        self._initialize_members(hdfs_app_path, kafkaproducer, conf_type)

    def _initialize_members(self, hdfs_app_path, kafkaproducer, conf_type):
        # get parameters.
        self._logger = logging.getLogger('SPOT.INGEST.DNS')
        self._hdfs_app_path = hdfs_app_path
        self._producer = kafkaproducer

        # get script path.
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read dns configuration.
        conf_file = "{0}/ingest_conf.json".format(os.path.dirname(os.path.dirname(self._script_path)))
        with open(conf_file) as conf_fh:
            conf = json.loads(conf_fh.read())
        self._conf = conf["pipelines"][conf_type]

        # set destination configuration.
        self._collector_path = self._conf['collector_path']
        self._dsource = 'dns'
        self._hdfs_root_path = "{0}/{1}".format(hdfs_app_path, self._dsource)

        # set pcap split configuration.
        self._pkt_num = self._conf['pkt_num']
        self._pcap_split_staging = self._conf['pcap_split_staging']
        self._supported_files = self._conf['supported_files']

        # create collector watcher.
        self._watcher = FileWatcher(self._collector_path, self._supported_files)

        # multiprocessing.
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)
        # TODO: review re-use of hdfs.client
        self._hdfs_client = hdfs.get_client()

    def start(self):
        self._logger.info("Starting DNS ingest")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping DNS collector...")
            Util.remove_kafka_topic(self._producer.Zookeeper, self._producer.Topic, self._logger)
            self._watcher.stop()
            # terminate() stops the workers immediately; close() is a no-op
            # after terminate(), so only join() is needed afterwards.
            self._pool.terminate()
            self._pool.join()
            raise SystemExit("Ingest finished...")

    def _ingest_files_pool(self):
        if self._watcher.HasFiles:
            for x in range(0, self._processes):
                self._logger.info('processes: {0}'.format(self._processes))
                new_file = self._watcher.GetNextFile()
                if self._processes <= 1:
                    _ingest_file(
                        self._hdfs_client,
                        new_file,
                        self._pkt_num,
                        self._pcap_split_staging,
                        self._hdfs_root_path,
                        self._producer,
                        self._producer.Topic
                    )
                else:
                    result = self._pool.apply_async(_ingest_file, args=(
                        self._hdfs_client,
                        new_file,
                        self._pkt_num,
                        self._pcap_split_staging,
                        self._hdfs_root_path,
                        self._producer,
                        self._producer.Topic
                    ))
                    # result.get()  # to debug, add try/except around this call.
                if not self._watcher.HasFiles:
                    break
        return True


def _ingest_file(hdfs_client, new_file, pkt_num, pcap_split_staging, hdfs_root_path, producer, topic):

    logger = logging.getLogger('SPOT.INGEST.DNS.{0}'.format(os.getpid()))

    try:
        # get file name and date.
        org_file = new_file
        file_name_parts = new_file.split('/')
        file_name = file_name_parts[-1]

        # split the pcap into chunks of pkt_num packets.
        name = file_name.split('.')[0]
        split_cmd = "editcap -c {0} {1} {2}/{3}_spot.pcap".format(pkt_num, new_file, pcap_split_staging, name)
        logger.info("Splitting file: {0}".format(split_cmd))
        Util.execute_cmd(split_cmd, logger)

        logger.info("Removing file: {0}".format(org_file))
        rm_big_file = "rm {0}".format(org_file)
        Util.execute_cmd(rm_big_file, logger)

    except Exception as err:
        logger.error("There was a problem splitting the file: {0}".format(err))

    for currdir, subdirs, files in os.walk(pcap_split_staging):
        for file in files:
            if file.endswith(".pcap") and "{0}_spot".format(name) in file:
                # get timestamp from the file name to build the hdfs path.
                file_date = file.split('.')[0]
                pcap_hour = file_date[-6:-4]
                pcap_date_path = file_date[-14:-6]

                # hdfs path with timestamp.
                hdfs_path = "{0}/binary/{1}/{2}".format(hdfs_root_path, pcap_date_path, pcap_hour)

                # create hdfs path.
                try:
                    if len(hdfs.list_dir(hdfs_path, hdfs_client)) == 0:
                        logger.info('creating directory: ' + hdfs_path)
                        hdfs_client.mkdir(hdfs_path, hdfs_client)

                    # load file to hdfs.
                    hadoop_pcap_file = "{0}/{1}".format(hdfs_path, file)
                    result = hdfs_client.upload_file(hadoop_pcap_file, os.path.join(currdir, file))
                    if not result:
                        logger.error('File failed to upload: ' + hadoop_pcap_file)
                        raise HdfsException

                    # create event for workers to process the file.
                    logger.info("Sending split file to Topic: {0}".format(topic))
                    producer.SendMessage(hadoop_pcap_file, topic)
                    logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(file, topic))

                except HdfsException as err:
                    logger.error('Exception: {0}'.format(err))
                    logger.info('Check HDFS connection settings and server health')

                except Exception as err:
                    logger.info("File {0} failed to be sent to Kafka Topic: {1}".format(new_file, topic))
                    logger.error("Error: {0}".format(err))
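

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module; the names below are
# assumptions). The collector is normally driven by the ingest master
# script, which reads ingest_conf.json, builds a Kafka producer wrapper
# (an object exposing the Topic and Zookeeper attributes plus the
# SendMessage(message, topic) method used above), and then does roughly:
#
#     producer = SomeKafkaProducerWrapper(...)   # hypothetical wrapper
#     collector = Collector('/user/spot', producer, 'dns')
#     collector.start()                          # blocks until Ctrl+C
# ---------------------------------------------------------------------------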