/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.dag.history.logging.ats;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.hadoop.yarn.api.records.timeline.TimelineEntityGroupId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.timeline.TimelineEntity;
import org.apache.hadoop.yarn.api.records.timeline.TimelinePutResponse;
import org.apache.hadoop.yarn.api.records.timeline.TimelinePutResponse.TimelinePutError;
import org.apache.hadoop.yarn.client.api.TimelineClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.tez.common.ReflectionUtils;
import org.apache.tez.common.TezUtilsInternal;
import org.apache.tez.common.security.DAGAccessControls;
import org.apache.tez.common.security.HistoryACLPolicyException;
import org.apache.tez.common.security.HistoryACLPolicyManager;
import org.apache.tez.dag.api.DagTypeConverters;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezConstants;
import org.apache.tez.dag.api.TezReflectionException;
import org.apache.tez.dag.api.records.DAGProtos.DAGPlan;
import org.apache.tez.dag.history.DAGHistoryEvent;
import org.apache.tez.dag.history.HistoryEvent;
import org.apache.tez.dag.history.HistoryEventType;
import org.apache.tez.dag.history.events.DAGSubmittedEvent;
import org.apache.tez.dag.history.logging.HistoryLoggingService;
import org.apache.tez.dag.history.events.DAGRecoveredEvent;
import org.apache.tez.dag.history.logging.EntityTypes;
import org.apache.tez.dag.records.TezDAGID;

import com.google.common.annotations.VisibleForTesting;

public class ATSV15HistoryLoggingService extends HistoryLoggingService {

  private static final Logger LOG = LoggerFactory.getLogger(ATSV15HistoryLoggingService.class);

  @VisibleForTesting
  LinkedBlockingQueue<DAGHistoryEvent> eventQueue =
      new LinkedBlockingQueue<DAGHistoryEvent>();

  private Thread eventHandlingThread;
  private AtomicBoolean stopped = new AtomicBoolean(false);
  private int eventCounter = 0;
  private int eventsProcessed = 0;
  private final Object lock = new Object();
  private boolean historyLoggingEnabled = true;

  @VisibleForTesting
  TimelineClient timelineClient;

  private HashSet<TezDAGID> skippedDAGs = new HashSet<TezDAGID>();
  private Map<TezDAGID, String> dagDomainIdMap = new HashMap<TezDAGID, String>();
  private long maxTimeToWaitOnShutdown;
  private boolean waitForeverOnShutdown = false;

  private long maxPollingTimeMillis;

  private String sessionDomainId;
  private static final String atsHistoryLoggingServiceClassName =
      ATSV15HistoryLoggingService.class.getName();
  private static final String atsHistoryACLManagerClassName =
      "org.apache.tez.dag.history.ats.acls.ATSV15HistoryACLPolicyManager";

  @VisibleForTesting
  HistoryACLPolicyManager historyACLPolicyManager;

  private int numDagsPerGroup;

  public ATSV15HistoryLoggingService() {
    super(ATSV15HistoryLoggingService.class.getName());
  }

  @Override
  public void serviceInit(Configuration serviceConf) throws Exception {
    Configuration conf = new Configuration(serviceConf);

    String summaryEntityTypesStr = EntityTypes.TEZ_APPLICATION
        + "," + EntityTypes.TEZ_APPLICATION_ATTEMPT
        + "," + EntityTypes.TEZ_DAG_ID;

    // Ensure that summary entity types are defined properly for Tez.
    if (conf.getBoolean(TezConfiguration.TEZ_AM_ATS_V15_OVERRIDE_SUMMARY_TYPES,
        TezConfiguration.TEZ_AM_ATS_V15_OVERRIDE_SUMMARY_TYPES_DEFAULT)) {
      conf.set(YarnConfiguration.TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_SUMMARY_ENTITY_TYPES,
          summaryEntityTypesStr);
    }

    historyLoggingEnabled = conf.getBoolean(TezConfiguration.TEZ_AM_HISTORY_LOGGING_ENABLED,
        TezConfiguration.TEZ_AM_HISTORY_LOGGING_ENABLED_DEFAULT);
    if (!historyLoggingEnabled) {
      LOG.info("ATSService: History Logging disabled. "
          + TezConfiguration.TEZ_AM_HISTORY_LOGGING_ENABLED + " set to false");
      return;
    }

    if (conf.getBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED,
      YarnConfiguration.DEFAULT_TIMELINE_SERVICE_ENABLED)) {
      timelineClient = TimelineClient.createTimelineClient();
      timelineClient.init(conf);
    } else {
      this.timelineClient = null;
      if (conf.get(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS, "")
        .equals(atsHistoryLoggingServiceClassName)) {
        LOG.warn(atsHistoryLoggingServiceClassName
            + " is disabled due to Timeline Service being disabled, "
            + YarnConfiguration.TIMELINE_SERVICE_ENABLED + " set to false");
      }
    }
    maxTimeToWaitOnShutdown = conf.getLong(
        TezConfiguration.YARN_ATS_EVENT_FLUSH_TIMEOUT_MILLIS,
        TezConfiguration.YARN_ATS_EVENT_FLUSH_TIMEOUT_MILLIS_DEFAULT);
    maxPollingTimeMillis = conf.getInt(
        TezConfiguration.YARN_ATS_MAX_POLLING_TIME_PER_EVENT,
        TezConfiguration.YARN_ATS_MAX_POLLING_TIME_PER_EVENT_DEFAULT);
    if (maxTimeToWaitOnShutdown < 0) {
      waitForeverOnShutdown = true;
    }

    LOG.info("Initializing " + ATSV15HistoryLoggingService.class.getSimpleName() + " with "
        + ", maxPollingTime(ms)=" + maxPollingTimeMillis
        + ", waitTimeForShutdown(ms)=" + maxTimeToWaitOnShutdown
        + ", TimelineACLManagerClass=" + atsHistoryACLManagerClassName);

    try {
      historyACLPolicyManager = ReflectionUtils.createClazzInstance(
          atsHistoryACLManagerClassName);
      historyACLPolicyManager.setConf(conf);
    } catch (TezReflectionException e) {
      LOG.warn("Could not instantiate object for " + atsHistoryACLManagerClassName
          + ". ACLs cannot be enforced correctly for history data in Timeline", e);
      if (!conf.getBoolean(TezConfiguration.TEZ_AM_ALLOW_DISABLED_TIMELINE_DOMAINS,
          TezConfiguration.TEZ_AM_ALLOW_DISABLED_TIMELINE_DOMAINS_DEFAULT)) {
        throw e;
      }
      historyACLPolicyManager = null;
    }

    numDagsPerGroup = conf.getInt(TezConfiguration.TEZ_HISTORY_LOGGING_TIMELINE_NUM_DAGS_PER_GROUP,
        TezConfiguration.TEZ_HISTORY_LOGGING_TIMELINE_NUM_DAGS_PER_GROUP_DEFAULT);
  }

  @Override
  public void serviceStart() {
    if (!historyLoggingEnabled || timelineClient == null) {
      return;
    }
    timelineClient.start();

    // create a session domain id, if it fails then disable history logging.
    try {
      sessionDomainId = createSessionDomain();
    } catch (HistoryACLPolicyException | IOException e) {
      LOG.warn("Could not setup history acls, disabling history logging.", e);
      historyLoggingEnabled = false;
      return;
    }

    eventHandlingThread = new Thread(new Runnable() {
      @Override
      public void run() {
        boolean interrupted = false;
        TezUtilsInternal.setHadoopCallerContext(appContext.getHadoopShim(),
            appContext.getApplicationID());
        while (!stopped.get() && !Thread.currentThread().isInterrupted()
              && !interrupted) {

          // Log the size of the event-queue every so often.
          if (eventCounter != 0 && eventCounter % 1000 == 0) {
            if (eventsProcessed != 0 && !eventQueue.isEmpty()) {
              LOG.info("Event queue stats"
                  + ", eventsProcessedSinceLastUpdate=" + eventsProcessed
                  + ", eventQueueSize=" + eventQueue.size());
            }
            eventCounter = 0;
            eventsProcessed = 0;
          } else {
            ++eventCounter;
          }

          synchronized (lock) {
            try {
              DAGHistoryEvent event = eventQueue.poll(maxPollingTimeMillis, TimeUnit.MILLISECONDS);
              if (event == null) {
                continue;
              }
              if (!isValidEvent(event)) {
                continue;
              }

              try {
                handleEvents(event);
                eventsProcessed += 1;
              } catch (Exception e) {
                LOG.warn("Error handling events", e);
              }
            } catch (InterruptedException e) {
              // Finish processing events and then return
              interrupted = true;
            }
          }
        }
      }
    }, "HistoryEventHandlingThread");
    eventHandlingThread.start();
  }

  @Override
  public void serviceStop() {
    LOG.info("Stopping ATSService"
        + ", eventQueueBacklog=" + eventQueue.size());
    stopped.set(true);
    if (eventHandlingThread != null) {
      eventHandlingThread.interrupt();
    }
    try {
      TezUtilsInternal.setHadoopCallerContext(appContext.getHadoopShim(),
          appContext.getApplicationID());
      synchronized (lock) {
        if (!eventQueue.isEmpty()) {
          LOG.warn("ATSService being stopped"
              + ", eventQueueBacklog=" + eventQueue.size()
              + ", maxTimeLeftToFlush=" + maxTimeToWaitOnShutdown
              + ", waitForever=" + waitForeverOnShutdown);
          long startTime = appContext.getClock().getTime();
          long endTime = startTime + maxTimeToWaitOnShutdown;
          while (waitForeverOnShutdown || (endTime >= appContext.getClock().getTime())) {
            try {
              DAGHistoryEvent event = eventQueue.poll(maxPollingTimeMillis, TimeUnit.MILLISECONDS);
              if (event == null) {
                LOG.info("Event queue empty, stopping ATS Service");
                break;
              }
              if (!isValidEvent(event)) {
                continue;
              }
              try {
                handleEvents(event);
              } catch (Exception e) {
                LOG.warn("Error handling event", e);
              }
            } catch (InterruptedException e) {
              LOG.info("ATSService interrupted while shutting down. Exiting."
                  + " EventQueueBacklog=" + eventQueue.size());
            }
          }
        }
      }
    } finally {
      appContext.getHadoopShim().clearHadoopCallerContext();
    }
    if (!eventQueue.isEmpty()) {
      LOG.warn("Did not finish flushing eventQueue before stopping ATSService"
          + ", eventQueueBacklog=" + eventQueue.size());
    }
    if (timelineClient != null) {
      timelineClient.stop();
    }
    if (historyACLPolicyManager != null) {
      historyACLPolicyManager.close();
    }
  }

  @VisibleForTesting
  public TimelineEntityGroupId getGroupId(DAGHistoryEvent event) {
    // Changing this function will impact TimelineCachePluginImpl and should be done very
    // carefully to account for handling different versions of Tez
    switch (event.getHistoryEvent().getEventType()) {
      case DAG_SUBMITTED:
      case DAG_INITIALIZED:
      case DAG_STARTED:
      case DAG_FINISHED:
      case DAG_KILL_REQUEST:
      case VERTEX_INITIALIZED:
      case VERTEX_STARTED:
      case VERTEX_CONFIGURE_DONE:
      case VERTEX_FINISHED:
      case TASK_STARTED:
      case TASK_FINISHED:
      case TASK_ATTEMPT_STARTED:
      case TASK_ATTEMPT_FINISHED:
      case DAG_COMMIT_STARTED:
      case VERTEX_COMMIT_STARTED:
      case VERTEX_GROUP_COMMIT_STARTED:
      case VERTEX_GROUP_COMMIT_FINISHED:
      case DAG_RECOVERED:
        String entityGroupId = numDagsPerGroup > 1
            ? event.getDagID().getGroupId(numDagsPerGroup)
            : event.getDagID().toString();
        return TimelineEntityGroupId.newInstance(event.getDagID().getApplicationId(), entityGroupId);
      case APP_LAUNCHED:
      case AM_LAUNCHED:
      case AM_STARTED:
      case CONTAINER_LAUNCHED:
      case CONTAINER_STOPPED:
        return TimelineEntityGroupId.newInstance(appContext.getApplicationID(),
            appContext.getApplicationID().toString());
    }
    return null;
  }

  public void handle(DAGHistoryEvent event) {
    if (historyLoggingEnabled && timelineClient != null) {
      eventQueue.add(event);
    }
  }

  private boolean isValidEvent(DAGHistoryEvent event) {
    HistoryEventType eventType = event.getHistoryEvent().getEventType();
    TezDAGID dagId = event.getDagID();

    if (eventType.equals(HistoryEventType.DAG_SUBMITTED)) {
      DAGSubmittedEvent dagSubmittedEvent =
          (DAGSubmittedEvent) event.getHistoryEvent();
      String dagName = dagSubmittedEvent.getDAGName();
      if ((dagName != null
          && dagName.startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX))
          || (!dagSubmittedEvent.isHistoryLoggingEnabled())) {
        // Skip recording pre-warm DAG events
        skippedDAGs.add(dagId);
        return false;
      }
    }
    if (eventType.equals(HistoryEventType.DAG_RECOVERED)) {
      DAGRecoveredEvent dagRecoveredEvent = (DAGRecoveredEvent) event.getHistoryEvent();
      if (!dagRecoveredEvent.isHistoryLoggingEnabled()) {
        skippedDAGs.add(dagRecoveredEvent.getDagID());
        return false;
      }
    }
    if (eventType.equals(HistoryEventType.DAG_FINISHED)) {
      // Remove from set to keep size small
      // No more events should be seen after this point.
      if (skippedDAGs.remove(dagId)) {
        return false;
      }
    }

    if (dagId != null && skippedDAGs.contains(dagId)) {
      // Skip pre-warm DAGs
      return false;
    }

    return true;
  }

  private void handleEvents(DAGHistoryEvent event) {
    String domainId = getDomainForEvent(event);
    // skippedDags is updated in the above call so check again.
    if (event.getDagID() != null && skippedDAGs.contains(event.getDagID())) {
      return;
    }
    TimelineEntityGroupId groupId = getGroupId(event);
    List<TimelineEntity> entities = HistoryEventTimelineConversion.convertToTimelineEntities(
        event.getHistoryEvent());
    for (TimelineEntity entity : entities) {
      logEntity(groupId, entity, domainId);
    }
  }

  private void logEntity(TimelineEntityGroupId groupId, TimelineEntity entity, String domainId) {
    if (historyACLPolicyManager != null && domainId != null && !domainId.isEmpty()) {
      historyACLPolicyManager.updateTimelineEntityDomain(entity, domainId);
    }

    try {
      TimelinePutResponse response = timelineClient.putEntities(
          appContext.getApplicationAttemptId(), groupId, entity);
      if (response != null
          && !response.getErrors().isEmpty()) {
        int count = response.getErrors().size();
        for (int i = 0; i < count; ++i) {
          TimelinePutError err = response.getErrors().get(i);
          if (err.getErrorCode() != 0) {
            LOG.warn("Could not post history event to ATS"
                + ", atsPutError=" + err.getErrorCode()
                + ", entityId=" + err.getEntityId());
          }
        }
      }
      // Do nothing additional, ATS client library should handle throttling
      // or auto-disable as needed
    } catch (Exception e) {
      LOG.warn("Could not handle history events", e);
    }
  }

  private String getDomainForEvent(DAGHistoryEvent event) {
    String domainId = sessionDomainId;
    if (historyACLPolicyManager == null) {
      return domainId;
    }

    TezDAGID dagId = event.getDagID();
    HistoryEvent historyEvent = event.getHistoryEvent();
    if (dagId == null || !HistoryEventType.isDAGSpecificEvent(historyEvent.getEventType())) {
      return domainId;
    }

    if (dagDomainIdMap.containsKey(dagId)) {
      // If we already have the domain for the dag id return it
      domainId = dagDomainIdMap.get(dagId);
      // Cleanup if this is the last event.
      if (historyEvent.getEventType() == HistoryEventType.DAG_FINISHED) {
        dagDomainIdMap.remove(dagId);
      }
    } else if (HistoryEventType.DAG_SUBMITTED == historyEvent.getEventType()
        || HistoryEventType.DAG_RECOVERED == historyEvent.getEventType()) {
      // In case this is the first event for the dag, create and populate dag domain.
      Configuration conf;
      DAGPlan dagPlan;
      if (HistoryEventType.DAG_SUBMITTED == historyEvent.getEventType()) {
          conf = ((DAGSubmittedEvent)historyEvent).getConf();
          dagPlan = ((DAGSubmittedEvent)historyEvent).getDAGPlan();
      } else {
         conf = appContext.getCurrentDAG().getConf();
         dagPlan = appContext.getCurrentDAG().getJobPlan();
      }
      domainId = createDagDomain(conf, dagPlan, dagId);

      // createDagDomain updates skippedDAGs so another check here.
      if (skippedDAGs.contains(dagId)) {
        return null;
      }

      dagDomainIdMap.put(dagId, domainId);
    }
    return domainId;
  }

  /**
   * Creates a domain for the session.
   * @return domainId to be used. null if acls are disabled.
   * @throws HistoryACLPolicyException, IOException Forward if historyACLPolicyManger exception.
   */
  private String createSessionDomain() throws IOException, HistoryACLPolicyException {
    if (historyACLPolicyManager == null) {
      return null;
    }
    Map<String, String> domainInfo = historyACLPolicyManager.setupSessionACLs(getConfig(),
        appContext.getApplicationID());
    if (domainInfo != null) {
      return domainInfo.get(TezConfiguration.YARN_ATS_ACL_SESSION_DOMAIN_ID);
    }
    return null;
  }

  /**
   * When running in session mode, create a domain for the dag and return it.
   * @param dagConf The configuration the dag for which domain has to be created.
   * @param dagPlan The dag plan which contains the ACLs.
   * @param dagId The dagId for which domain has to be created.
   * @return The created domain id on success.
   *     sessionDomainId: If there is a failure also disable history logging for this dag.
   *     sessionDomainId: If historyACLPolicyManager returns null.
   */
  private String createDagDomain(Configuration dagConf, DAGPlan dagPlan, TezDAGID dagId) {
    // In non session mode dag domain is same as session domain id.
    if (!appContext.isSession()) {
      return sessionDomainId;
    }
    DAGAccessControls dagAccessControls = dagPlan.hasAclInfo()
        ? DagTypeConverters.convertDAGAccessControlsFromProto(dagPlan.getAclInfo())
        : null;
    try {
      Map<String, String> domainInfo = historyACLPolicyManager.setupSessionDAGACLs(
          dagConf, appContext.getApplicationID(), Integer.toString(dagId.getId()),
          dagAccessControls);
      if (domainInfo != null) {
        return domainInfo.get(TezConfiguration.YARN_ATS_ACL_DAG_DOMAIN_ID);
      }
      // Fallback to session domain, if domainInfo was null
      return sessionDomainId;
    } catch (IOException | HistoryACLPolicyException e) {
      LOG.warn("Could not setup ACLs for DAG, disabling history logging for dag.", e);
      skippedDAGs.add(dagId);
      // Return value is not used, check for skippedDAG is important.
      return null;
    }
  }
}