// Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.templates;

import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.templates.TemplateRecordDefinition;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

 * Annotates record definitions found in documents using a regular expression.
 * <p>RecordDefinitions are regions surrounded by &lt;&lt;record:NAME&gt;&gt; and
 * &lt;&lt;record:NAME&gt;&gt; marker text, where NAME is a user defined record type name and must
 * be consistent in the begin and end marker text. Each RecordDefinition should cover one or more
 * TemplateFieldDefinition annotations to be useful downstream.
 * <p>Markers for begin and end may be used to make it clearer in the document but are not required.
 * e.g. &lt;&lt;record:NAME:begin&gt;&gt; and &lt;&lt;record:NAME:end&gt;&gt; or &lt;&lt;record:NAME
 * begin&gt;&gt; and &lt;&lt;record:NAME end&gt;&gt;
 * <p>
 * <p>A repeating record is indicated with the attribute <code>repeat</code> in the begin record
 * marker. e.g. &lt;&lt;record:NAME repeat&gt;&gt; or &lt;&lt;record:NAME repeat="true" &gt;&gt;
 * <p>This annotator should be used in conjunction with {@link TemplateFieldDefinitionAnnotator}.
public class TemplateRecordDefinitionAnnotator extends BaleenAnnotator {

  /** Regular expression used to match records. */
  private static final String RECORD_TOKEN_REGEX =

  /** The Constant REPEAT_ATTRIBUTE */
  private static final String REPEAT_ATTRIBUTE = "repeat";

   * The compiled regular expression - compiled with the DOTALL option (effectively '(?s)' in the
   * regex) to enable matches over multiple lines.
  private static final Pattern RECORD_TOKEN_PATTERN =
      Pattern.compile(RECORD_TOKEN_REGEX, Pattern.DOTALL);

  protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    String documentText = jCas.getDocumentText();
    Matcher matcher = RECORD_TOKEN_PATTERN.matcher(documentText);
    while (matcher.find()) {
      createRecordDefinitionAnnotation(jCas, matcher);

   * Creates a record definition annotation and adds it to the JCas indexes.
   * @param jCas the JCas
   * @param matcher the matcher that triggered the creation, which must have two groups (first being
   *     the name, and the second being the content within the record)
  private void createRecordDefinitionAnnotation(JCas jCas, Matcher matcher) {
    TemplateRecordDefinition recordDefinition = new TemplateRecordDefinition(jCas);
    addAttributes(recordDefinition, "<record:" + matcher.group(1) + matcher.group(2) + " />");

   * Add the attributes to the given record definition
   * <p>Uses Jsoup to parse the tag as if html
   * @param recordDefinition the record definition
   * @param beginText the begin tag of the record definition
  private void addAttributes(TemplateRecordDefinition recordDefinition, String beginText) {

    Document doc = Jsoup.parseBodyFragment(beginText);
    Element fieldElement = doc.body().child(0);
    Attributes attributes = fieldElement.attributes();
    if (attributes.hasKey(REPEAT_ATTRIBUTE)) {
      String required = attributes.get(REPEAT_ATTRIBUTE);
          Strings.isNullOrEmpty(required) ? true : Boolean.valueOf(required));

  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        Collections.emptySet(), ImmutableSet.of(TemplateRecordDefinition.class));