#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import json
import os
import sys
import copy
import time

from common.utils import Util, NewFileEvent
from common.kafka_client import KafkaProducer
from common.file_collector import FileWatcher
from multiprocessing import Pool


class Collector(object):

    def __init__(self, hdfs_app_path, kafka_topic, conf_type):
        self._initialize_members(hdfs_app_path, kafka_topic, conf_type)

    def _initialize_members(self, hdfs_app_path, kafka_topic, conf_type):

        # getting parameters.
        self._logger = logging.getLogger('SPOT.INGEST.PROXY')
        self._hdfs_app_path = hdfs_app_path
        self._kafka_topic = kafka_topic

        # get script path.
        self._script_path = os.path.dirname(os.path.abspath(__file__))

        # read proxy configuration.
        conf_file = "{0}/ingest_conf.json".format(os.path.dirname(os.path.dirname(self._script_path)))
        conf = json.loads(open(conf_file).read())
        self._message_size = conf["kafka"]["message_size"]
        self._conf = conf["pipelines"][conf_type]

        # get collector path.
        self._collector_path = self._conf['collector_path']

        # get supported files.
        self._supported_files = self._conf['supported_files']

        # create collector watcher.
        self._watcher = FileWatcher(self._collector_path, self._supported_files)

        # multiprocessing.
        self._processes = conf["collector_processes"]
        self._ingestion_interval = conf["ingestion_interval"]
        self._pool = Pool(processes=self._processes)

    def start(self):

        self._logger.info("Starting PROXY collector")
        self._watcher.start()

        try:
            while True:
                self._ingest_files_pool()
                time.sleep(self._ingestion_interval)
        except KeyboardInterrupt:
            self._logger.info("Stopping Proxy collector...")
            Util.remove_kafka_topic(self._kafka_topic.Zookeeper, self._kafka_topic.Topic, self._logger)
            self._watcher.stop()
            self._pool.terminate()
            self._pool.close()
            self._pool.join()

    def _ingest_files_pool(self):

        if self._watcher.HasFiles:

            for x in range(0, self._processes):
                file = self._watcher.GetNextFile()
                result = self._pool.apply_async(ingest_file, args=(file, self._message_size, self._kafka_topic.Topic, self._kafka_topic.BootstrapServers))
                # result.get()  # to debug, add try and catch.
                if not self._watcher.HasFiles:
                    break
        return True


def ingest_file(file, message_size, topic, kafka_servers):

    logger = logging.getLogger('SPOT.INGEST.PROXY.{0}'.format(os.getpid()))

    try:
        message = ""
        logger.info("Ingesting file: {0} process:{1}".format(file, os.getpid()))
        with open(file, "rb") as f:
            for line in f:
                message += line
                if len(message) > message_size:
                    KafkaProducer.SendMessage(message, kafka_servers, topic, 0)
                    message = ""
            # send the last package.
            KafkaProducer.SendMessage(message, kafka_servers, topic, 0)

        rm_file = "rm {0}".format(file)
        Util.execute_cmd(rm_file, logger)
        logger.info("File {0} has been successfully sent to Kafka Topic: {1}".format(file, topic))

    except Exception as err:
        logger.error("There was a problem, please check the following error message: {0}".format(err))
        logger.error("Exception: {0}".format(err))
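

# Usage sketch (illustrative only): in Apache Spot this class is normally
# driven by the master ingest process, which builds the kafka topic object.
# It is assumed here to expose the Topic, Zookeeper and BootstrapServers
# attributes referenced above, and that ingest_conf.json defines the
# "proxy" pipeline along with collector_processes and ingestion_interval.
#
#   collector = Collector(hdfs_app_path, kafka_topic, 'proxy')
#   collector.start()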