// Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.SentenceEnhancer;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.grammar.DependencyGraph;

/**
 * Extract mentions from the jCas.
 *
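 * <p>A minimal usage sketch, assuming the surrounding annotator has already parsed the document
 * and built a dependency graph for it (e.g. via {@code DependencyGraph.build(jCas)}):
 *
 * <pre>{@code
 * DependencyGraph graph = DependencyGraph.build(jCas);
 * List<Mention> mentions = new MentionDetector(jCas, graph).detect();
 * }</pre>
 */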
public class MentionDetector {

  private final JCas jCas;
  private final DependencyGraph dependencyGraph;
  private List<WordToken> pronouns;
  private Collection<Entity> entities;

  /**
   * Create a new mention detector using the specified JCas and DependencyGraph.
   *
   * @param jCas the jCas to detect mentions in
   * @param dependencyGraph the dependency graph for the same jCas, used to find head words
   */
  public MentionDetector(JCas jCas, DependencyGraph dependencyGraph) {
    this.jCas = jCas;
    this.dependencyGraph = dependencyGraph;
  }

  /**
   * Detect mentions in the JCas provided in the constructor, using the dependency graph to find
   * head words, and return them as a list.
   *
   * @return the pronoun, entity and noun phrase mentions
   */
  public List<Mention> detect() {
    setup();

    final List<Mention> mentions = new ArrayList<>(pronouns.size() + entities.size());

    detectPronouns(mentions);

    detectEntities(mentions);

    detectPhrases(mentions);

    new SentenceEnhancer().enhance(jCas, mentions);

    return mentions;
  }

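  /**
   * Collect the inputs for detection: word tokens tagged as pronouns (PRP/PRP$ and WP/WP$ in the
   * Penn Treebank tag set, plus PP-prefixed tags used by some taggers) and all entities.
   */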
  private void setup() {
    pronouns =
        JCasUtil.select(jCas, WordToken.class).stream()
            .filter(
                w ->
                    w.getPartOfSpeech().startsWith("PP")
                        || w.getPartOfSpeech().startsWith("WP")
                        || w.getPartOfSpeech().startsWith("PRP"))
            .collect(Collectors.toList());

    entities = JCasUtil.select(jCas, Entity.class);
  }

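  /** Add a single-word mention for each pronoun token. */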
  private void detectPronouns(List<Mention> mentions) {
    pronouns.stream()
        .map(Mention::new)
        .map(
            m -> {
              final List<WordToken> list = Collections.singletonList((WordToken) m.getAnnotation());
              m.setWords(list);
              return m;
            })
        .forEach(mentions::add);
  }

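  /** Add a mention for each entity, recording its covered word tokens and its head word. */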
  private void detectEntities(Collection<Mention> mentions) {
    entities.stream()
        .map(Mention::new)
        .map(
            m -> {
              final Collection<WordToken> list =
                  JCasUtil.selectCovered(jCas, WordToken.class, m.getAnnotation());
              m.setWords(new ArrayList<>(list));
              m.setHeadWordToken(determineHead(m.getWords()));
              return m;
            })
        .forEach(mentions::add);
  }

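  /**
   * Determine the head word of a phrase using the dependency graph.
   *
   * @param words the word tokens making up the phrase
   * @return the head word, or null if no candidate was found
   */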
  private WordToken determineHead(List<WordToken> words) {
    // A dependency grammar approach to head word extraction:
    // - find the noun in the noun phrase which has a dependency link out of the phrase
    // - this appears to be the head word
    // TODO: Investigate other approaches (Collins 1999, etc.). Do they give the same/better
    // results?

    if (words.size() == 1) {
      return words.get(0);
    } else {
      final List<WordToken> candidates = identifyHeadCandidates(words);

      if (candidates.isEmpty()) {
        return null;
      }

      // TODO: Not sure whether more than one candidate is possible when everything works
      // correctly. Multiple candidates probably mean an entity was marked across an NP boundary,
      // which is likely wrong.
      // TODO: Not sure if we should pull out compound words here... (it's a head word, but even so)

      return candidates.get(0);
    }
  }

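  /**
   * Find the nouns in the phrase which have a dependency edge to a word outside the phrase; these
   * are the candidates for the head word.
   */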
  private List<WordToken> identifyHeadCandidates(List<WordToken> words) {
    final List<WordToken> candidates = new LinkedList<>();

    for (final WordToken word : words) {
      if (word.getPartOfSpeech().startsWith("N")) {
        final Stream<WordToken> edges = dependencyGraph.getEdges(word);
        if (edges.anyMatch(p -> !words.contains(p))) {
          candidates.add(word);
        }
      }
    }

    return candidates;
  }

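  /**
   * Add a mention for each noun phrase, having discarded phrases which cover an entity, phrases
   * which are not the largest for their head word, and single-word phrases filtered by {@link
   * #filterBySingleContent(WordToken)}.
   */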
  private void detectPhrases(List<Mention> mentions) {

    // Limit to noun phrases
    final List<PhraseChunk> phrases =
        JCasUtil.select(jCas, PhraseChunk.class).stream()
            .filter(p -> p.getChunkType().startsWith("N"))
            .collect(Collectors.toList());

    // Remove any noun phrases which cover entities
    JCasUtil.indexCovering(jCas, Entity.class, PhraseChunk.class).values().stream()
        .flatMap(Collection::stream)
        .forEach(phrases::remove);

    final Map<PhraseChunk, Collection<WordToken>> phraseToWord =
        JCasUtil.indexCovered(jCas, PhraseChunk.class, WordToken.class);

    // Create an index for head words
    final Multimap<WordToken, PhraseChunk> headToChunk = HashMultimap.create();
    phrases.forEach(
        p -> {
          final Collection<WordToken> collection = phraseToWord.get(p);
          final WordToken head = determineHead(new ArrayList<>(collection));
          if (head != null) {
            headToChunk.put(head, p);
          }
          // TODO: What should we do with phrases that have no head?
        });

    // Paper: keep only the largest noun phrase among those sharing a head word.
    headToChunk.asMap().entrySet().stream()
        .filter(e -> e.getValue().size() > 1)
        .forEach(
            e -> {
              PhraseChunk largest = null;
              int largestSize = 0;

              for (final PhraseChunk p : e.getValue()) {
                // the head is a word the phrases share, so we know they overlap
                final int size = p.getEnd() - p.getBegin();
                if (largest == null || largestSize < size) {
                  largest = p;
                  largestSize = size;
                }
              }

              // Remove all the smaller ones
              for (final PhraseChunk p : e.getValue()) {
                if (!p.equals(largest)) {
                  phrases.remove(p);
                }
              }
            });

    // Remove single-word phrases whose only token disqualifies them (see filterBySingleContent)
    JCasUtil.indexCovering(jCas, PhraseChunk.class, WordToken.class).entrySet().stream()
        .filter(e -> e.getValue().size() == 1)
        .filter(e -> filterBySingleContent(e.getValue().iterator().next()))
        .map(Entry::getKey)
        .forEach(phrases::remove);

    // TODO: Remove all pronouns which are covered by the phrases? I think not...
    // TODO: Paper removes pleonastic 'it' where possible (see its Appendix B for the regex)
    // TODO: Paper removes a static list of stop words (but we should determine that ourselves)
    // TODO: Paper removes partitives and quantifiers ("millions of people"). Unsure why though.

    phrases.stream()
        .map(Mention::new)
        .map(
            m -> {
              final List<WordToken> words = new ArrayList<>(phraseToWord.get(m.getAnnotation()));
              // TODO: We already calculated this earlier (for headToChunk), but we redo it here.
              // Would be nice to reuse that result.
              m.setWords(words);
              m.setHeadWordToken(determineHead(words));
              return m;
            })
        .forEach(mentions::add);
  }

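  /**
   * Test whether the single token of a one-word phrase means the phrase should not be a mention.
   *
   * @return true if the phrase should be removed
   */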
  private boolean filterBySingleContent(WordToken t) {
    // Remove noun phrases which consist solely of a pronoun; these are already captured as
    // pronoun mentions
    if (pronouns.contains(t)) {
      return true;
    }
    // Paper: remove cardinal / numeric phrases
    return "CD".equalsIgnoreCase(t.getPartOfSpeech());
  }
}