/**
 * Copyright 2013 Diego Ceccarelli
 *
 * <p>Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 *
 * <p>http://www.apache.org/licenses/LICENSE-2.0
 *
 * <p>Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 */
package it.cnr.isti.hpc.wikipedia.parser;

import de.tudarmstadt.ukp.wikipedia.parser.Content;
import de.tudarmstadt.ukp.wikipedia.parser.ContentElement;
import de.tudarmstadt.ukp.wikipedia.parser.DefinitionList;
import de.tudarmstadt.ukp.wikipedia.parser.NestedList;
import de.tudarmstadt.ukp.wikipedia.parser.NestedListContainer;
import de.tudarmstadt.ukp.wikipedia.parser.Paragraph;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import de.tudarmstadt.ukp.wikipedia.parser.Span;
import de.tudarmstadt.ukp.wikipedia.parser.mediawiki.MediaWikiParser;
import it.cnr.isti.hpc.wikipedia.article.Article;
import it.cnr.isti.hpc.wikipedia.article.ArticleHelper;
import it.cnr.isti.hpc.wikipedia.article.ArticleType;
import it.cnr.isti.hpc.wikipedia.article.Language;
import it.cnr.isti.hpc.wikipedia.article.Link;
import it.cnr.isti.hpc.wikipedia.article.LinkType;
import it.cnr.isti.hpc.wikipedia.article.Table;
import it.cnr.isti.hpc.wikipedia.article.Template;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
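 * Parses the MediaWiki markup of an article written in a given language and fills an {@link
 * Article.Builder} with the extracted content (paragraphs, templates, links, categories,
 * highlights, sections, tables and lists). It expects to find a locale file for the language in
 * <tt>src/main/resources/</tt>.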
 *
 * @see Locale
 * @author Diego Ceccarelli <[email protected]>
 *     <p>Created on Feb 14, 2013
 */
public class ArticleParser {

  /** Factory shared by all instances; the constructor asks it for a language-specific parser. */
  static MediaWikiParserFactory parserFactory = new MediaWikiParserFactory();

  private static final Logger logger = LoggerFactory.getLogger(ArticleParser.class);

  /**
   * English, declared as the default language. NOTE(review): nothing in this class reads this
   * constant - confirm whether it is still needed.
   */
  private static final Language DEFAULT_LANGUAGE = Language.EN;

  /** Language assigned by the constructor and copied onto every article that gets parsed. */
  private Language lang;

  // Presumably the maximum length of an article's short description; the only visible use
  // (setShortDescription) is commented out in parse() - TODO confirm with other classes.
  static int shortDescriptionLength = 500;
  /** Redirect markers of the language, as returned by {@code locale.getRedirectIdentifiers()}. */
  private final List<String> redirects;

  /** MediaWiki parser obtained from {@link #parserFactory} for the language. */
  private final MediaWikiParser parser;
  /** Language-specific identifiers (redirect, list and disambiguation markers). */
  private final Locale locale;

  public ArticleParser(String lang) {
    this.lang = Language.valueOf(lang.toUpperCase());
    parser = parserFactory.getParser(lang.toLowerCase());
    locale = new Locale(lang.toLowerCase());
    redirects = locale.getRedirectIdentifiers();
  }

  public ArticleParser(Language lang) {
    this(lang.toString().toUpperCase());
  }

  /**
   * Parses raw MediaWiki markup and stores everything extracted from it in the given builder.
   *
   * @param article builder that receives the extracted data
   * @param mediawiki raw MediaWiki markup of the article
   */
  public void parse(Article.Builder article, String mediawiki) {
    // Parse first: if the parser throws, the builder is left untouched.
    final ParsedPage parsed = parser.parse(mediawiki);
    // Redirects are detected on the raw markup, before the structured extraction.
    setRedirect(article, mediawiki);
    parse(article, parsed);
  }

  /**
   * Fills the builder from an already parsed page.
   *
   * <p>Language and wiki title are always set. The page-derived fields are only set when a page
   * is available. Redirect, disambiguation and list detection run in every case, in that order.
   *
   * @param article builder that receives the extracted data
   * @param page parsed page, may be {@code null}
   */
  private void parse(Article.Builder article, ParsedPage page) {
    article.setLang(lang);
    setWikiTitle(article);

    if (page != null) {
      setParagraphs(article, page);
      // setShortDescription(article);  (short description generation is currently disabled)
      setTemplates(article, page);
      setLinks(article, page);
      setCategories(article, page);
      setHighlights(article, page);
      setSections(article, page);
      setTables(article, page);
      setEnWikiTitle(article, page);
      setLists(article, page);
    } else {
      logger.warn("page is null for article {}", article.getTitle());
    }

    // These steps read what was stored above (lists, templates, title), so they must run last.
    setRedirect(article);
    setDisambiguation(article);
    setIsList(article);
  }

  /**
   * Collects the text of every definition list and every nested list of the page and stores the
   * result on the article. Definition lists come first, followed by the nested lists; each list
   * becomes one inner list of item texts.
   */
  private void setLists(Article.Builder article, ParsedPage page) {
    final List<List<String>> collected = new ArrayList<List<String>>();

    for (final DefinitionList definitionList : page.getDefinitionLists()) {
      final List<String> definitions = new ArrayList<String>();
      for (final ContentElement definition : definitionList.getDefinitions()) {
        definitions.add(definition.getText());
      }
      collected.add(definitions);
    }

    for (final NestedListContainer container : page.getNestedLists()) {
      final List<String> items = new ArrayList<String>();
      for (final NestedList item : container.getNestedLists()) {
        items.add(item.getText());
      }
      collected.add(items);
    }

    article.setLists(collected);
  }

  /** Derives the wiki-style title from the article title and stores it on the article. */
  private void setWikiTitle(Article.Builder article) {
    final String wikiTitle = ArticleHelper.getTitleInWikistyle(article.getTitle());
    article.setWikiTitle(wikiTitle);
  }

  /**
   * Builds the link and appends it to the list matching its parser-level type: internal and image
   * links go to {@code links}, external links go to {@code externalLinks}. Links of any other
   * type are dropped without being built.
   */
  private void addLink(
      Link.Builder linkBuilder,
      de.tudarmstadt.ukp.wikipedia.parser.Link.type linkType,
      final List<Link> links,
      final List<Link> externalLinks) {
    final boolean internalOrImage =
        linkType == de.tudarmstadt.ukp.wikipedia.parser.Link.type.INTERNAL
            || linkType == de.tudarmstadt.ukp.wikipedia.parser.Link.type.IMAGE;
    if (internalOrImage) {
      links.add(linkBuilder.build());
      return;
    }
    if (linkType == de.tudarmstadt.ukp.wikipedia.parser.Link.type.EXTERNAL) {
      externalLinks.add(linkBuilder.build());
    }
  }

  /**
   * Converts a parser link into a link builder.
   *
   * <p>The anchor is the link text. When the text is empty, the anchor falls back to the
   * de-wikified target or, for images that have parameters, to the last parameter. Image links
   * always get {@link LinkType#IMAGE}; other links get {@code jsonWikipediaType}.
   *
   * @param link the link produced by the MediaWiki parser
   * @param jsonWikipediaType type to assign to non-image links
   * @return the populated builder, or {@code null} when the target is empty or the link is
   *     neither internal, external nor an image
   */
  private Link.Builder createLink(
      final de.tudarmstadt.ukp.wikipedia.parser.Link link, final LinkType jsonWikipediaType) {
    if (link.getTarget().isEmpty()) {
      logger.warn("Empty link target for link {}", link);
      return null;
    }

    final de.tudarmstadt.ukp.wikipedia.parser.Link.type kind = link.getType();
    final boolean image = kind == de.tudarmstadt.ukp.wikipedia.parser.Link.type.IMAGE;
    final boolean supported =
        image
            || kind == de.tudarmstadt.ukp.wikipedia.parser.Link.type.INTERNAL
            || kind == de.tudarmstadt.ukp.wikipedia.parser.Link.type.EXTERNAL;
    if (!supported) {
      logger.debug("No link for [{}] built: link type {} ", link.getText(), link.getType());
      return null;
    }

    final Link.Builder builder = Link.newBuilder();
    builder.setId(link.getTarget());
    builder.setStart(link.getPos().getStart());
    builder.setEnd(link.getPos().getEnd());

    // Work out the final anchor before storing it.
    String anchor = link.getText();
    if (anchor.isEmpty()) {
      anchor = ArticleHelper.wikiStyleToText(link.getTarget());
      if (image) {
        final List<String> parameters = link.getParameters();
        if (!parameters.isEmpty()) {
          anchor = parameters.get(parameters.size() - 1);
        }
      }
    }
    builder.setAnchor(anchor);
    builder.setType(image ? LinkType.IMAGE : jsonWikipediaType);
    return builder;
  }

  /**
   * Collects the links found in the paragraphs of the page. Each link is tagged with {@link
   * LinkType#BODY} and the zero-based index of its paragraph.
   */
  private void setLinksInParagraphs(
      final List<Link> links, final List<Link> externalLinks, ParsedPage page) {
    int index = 0;
    for (final Paragraph paragraph : page.getParagraphs()) {
      for (final de.tudarmstadt.ukp.wikipedia.parser.Link parsed : paragraph.getLinks()) {
        final Link.Builder builder = createLink(parsed, LinkType.BODY);
        if (builder == null) {
          continue;
        }
        builder.setParagraphId(index);
        addLink(builder, parsed.getType(), links, externalLinks);
      }
      index++;
    }
  }

  /**
   * Collects the links found in the tables of the page. Each link is tagged with {@link
   * LinkType#TABLE}, the zero-based index of its table and the row and column reported by the
   * table element that contains it.
   */
  private void setLinksInTables(
      final List<Link> links, final List<Link> externalLinks, ParsedPage page) {
    int tableIndex = 0;
    for (final de.tudarmstadt.ukp.wikipedia.parser.Table table : page.getTables()) {
      for (int cell = 0; cell < table.nrOfTableElements(); cell++) {
        final int columnId = table.getTableElement(cell).getCol();
        final int rowId = table.getTableElement(cell).getRow();
        for (final de.tudarmstadt.ukp.wikipedia.parser.Link parsed :
            table.getTableElement(cell).getLinks()) {
          final Link.Builder builder = createLink(parsed, LinkType.TABLE);
          if (builder == null) {
            continue;
          }
          builder.setTableId(tableIndex);
          builder.setRowId(rowId);
          builder.setColumnId(columnId);
          addLink(builder, parsed.getType(), links, externalLinks);
        }
      }
      tableIndex++;
    }
  }

  /**
   * Collects the links found in the nested lists of the page. Each link is tagged with {@link
   * LinkType#LIST}, the zero-based index of its list container and the zero-based index of the
   * item inside that container.
   */
  private void setLinksInLists(
      final List<Link> links, final List<Link> externalLinks, ParsedPage page) {
    int containerIndex = 0;
    for (final NestedListContainer container : page.getNestedLists()) {
      int itemIndex = 0;
      for (final NestedList entry : container.getNestedLists()) {
        for (final de.tudarmstadt.ukp.wikipedia.parser.Link parsed : entry.getLinks()) {
          final Link.Builder builder = createLink(parsed, LinkType.LIST);
          if (builder == null) {
            continue;
          }
          builder.setListId(containerIndex);
          builder.setListItem(itemIndex);
          addLink(builder, parsed.getType(), links, externalLinks);
        }
        itemIndex++;
      }
      containerIndex++;
    }
  }

  /**
   * Gathers the links of the page from paragraphs, tables and lists (in that order) and stores
   * them on the article, separated into internal/image links and external links.
   */
  private void setLinks(Article.Builder article, ParsedPage page) {
    // The total number of links on the page is used as the initial capacity of both lists.
    final int expected = page.getLinks().size();
    final List<Link> internal = new ArrayList<Link>(expected);
    final List<Link> external = new ArrayList<Link>(expected);

    setLinksInParagraphs(internal, external, page);
    setLinksInTables(internal, external, page);
    setLinksInLists(internal, external, page);

    article.setLinks(internal);
    article.setExternalLinks(external);
  }

  /**
   * Marks the article as {@link ArticleType#LIST} when its title starts (ignoring case) with one
   * of the list identifiers of the locale. This overwrites any type assigned earlier.
   */
  private void setIsList(Article.Builder article) {
    for (final String prefix : locale.getListIdentifiers()) {
      if (!StringUtils.startsWithIgnoreCase(article.getTitle(), prefix)) {
        continue;
      }
      article.setType(ArticleType.LIST);
    }
  }

  /**
   * Detects a redirect from the first item of the first list stored on the article.
   *
   * <p>The check only runs when the builder has a redirect value and that value is empty. If the
   * first list item starts (ignoring case) with a redirect marker, everything after its first
   * space becomes the wiki-style redirect target and the article type becomes {@link
   * ArticleType#REDIRECT}.
   *
   * <p>NOTE(review): when no redirect was ever set on the builder this method returns
   * immediately, so the list-based detection is skipped - confirm this is intended.
   */
  private void setRedirect(Article.Builder article) {
    final boolean emptyRedirectSet = article.hasRedirect() && article.getRedirect().isEmpty();
    if (!emptyRedirectSet) {
      return;
    }

    final List<List<String>> lists = article.getLists();
    if (lists.isEmpty() || lists.get(0).isEmpty()) {
      return;
    }

    // Only the first item of the first list is inspected.
    final String firstItem = lists.get(0).get(0);
    for (final String marker : redirects) {
      if (!StringUtils.startsWithIgnoreCase(firstItem, marker)) {
        continue;
      }
      final int space = firstItem.indexOf(' ');
      if (space >= 0) {
        final String target = firstItem.substring(space).trim();
        article.setRedirect(ArticleHelper.getTitleInWikistyle(target));
        article.setType(ArticleType.REDIRECT);
      }
      // The first matching marker decides, whether or not a target could be extracted.
      return;
    }
  }

  /**
   * Detects a redirect directly from the raw markup.
   *
   * <p>When the markup starts (ignoring case) with one of the redirect markers of the locale, the
   * text between the first {@code [[} and the first {@code ]]} that follows it becomes the
   * wiki-style redirect target and the article type becomes {@link ArticleType#REDIRECT}. A
   * warning is logged when a marker matches but no complete {@code [[...]]} pair can be found.
   *
   * @param article builder that receives the redirect target and type
   * @param mediawiki raw MediaWiki markup of the article
   */
  private void setRedirect(Article.Builder article, String mediawiki) {
    for (final String redirect : redirects) {
      if (!StringUtils.startsWithIgnoreCase(mediawiki, redirect)) {
        continue;
      }
      // Test the raw indexOf result: adding 2 first would hide a missing "[[" (-1 + 2 == 1).
      final int open = mediawiki.indexOf("[[");
      // Search for "]]" after the opening brackets so a stray earlier "]]" cannot produce
      // end < start and make substring() throw.
      final int close = (open < 0) ? -1 : mediawiki.indexOf("]]", open + 2);
      if ((open < 0) || (close < 0)) {
        logger.warn("cannot find the redirect {}\n mediawiki: {}", article.getTitle(), mediawiki);
        continue;
      }
      final String r = ArticleHelper.getTitleInWikistyle(mediawiki.substring(open + 2, close));
      article.setRedirect(r);
      article.setType(ArticleType.REDIRECT);
      // Any further matching marker would extract the same target from the same markup.
      return;
    }
  }

  /**
   * Appends empty rows until {@code rows} holds at least {@code minLen} entries; does nothing
   * when it is already long enough.
   */
  private void ensureLength(List<List<String>> rows, int minLen) {
    for (int size = rows.size(); size < minLen; size++) {
      rows.add(new ArrayList<>());
    }
  }

  /**
   * Converts every table of the page into a {@link Table} (title, rows of cell texts, number of
   * columns and rows) and stores the resulting list on the article.
   *
   * @param article builder that receives the tables
   * @param page parsed page to read the tables from
   */
  private void setTables(Article.Builder article, ParsedPage page) {
    final List<Table> tables = new ArrayList<>();

    for (final de.tudarmstadt.ukp.wikipedia.parser.Table t : page.getTables()) {
      final List<List<String>> table = new ArrayList<>();

      // A missing title element or a null title text both map to the empty string.
      String title = "";
      if (t.getTitleElement() != null && t.getTitleElement().getText() != null) {
        title = t.getTitleElement().getText();
      }

      int maxCols = 0;
      for (int elementId = 0; elementId < t.nrOfTableElements(); elementId++) {
        final int col = t.getTableElement(elementId).getCol();
        final int row = t.getTableElement(elementId).getRow(); // row starts from 1
        final String elementText = t.getTableElement(elementId).getText();
        maxCols = Math.max(maxCols, col);
        // Parameterised logging: the message is only formatted when debug is enabled.
        logger.debug("row:{} col:{} content:{}", row, col, elementText);
        ensureLength(table, row + 1);
        // assume that columns come in order
        table.get(row).add(elementText);
      }
      if (!table.isEmpty() && table.get(0).isEmpty()) {
        // first row is empty (nothing was placed at index 0): drop it
        table.remove(0);
      }
      logger.debug("ParsedTable: \n {}", table);

      final Table.Builder tableBuilder = Table.newBuilder().setTitle(title);
      tableBuilder.setTable(table);
      tableBuilder.setNumCols(maxCols);
      tableBuilder.setNumRows(table.size());
      tables.add(tableBuilder.build());
    }
    article.setTables(tables);
  }

  /**
   * Stores the title of the corresponding English article, taken from the first language link
   * whose text starts with {@code "en:"}. Nothing is done for English articles. When the page
   * has no language links the English title is set to the empty string.
   *
   * @param article builder that receives the English title
   * @param page parsed page to read the language links from
   */
  protected void setEnWikiTitle(Article.Builder article, ParsedPage page) {
    final boolean english = article.getLang().equals(Language.EN);
    if (english) {
      return;
    }

    try {
      final boolean noLanguages = page.getLanguages() == null;
      if (noLanguages) {
        article.setEnWikiTitle("");
        return;
      }
    } catch (final NullPointerException e) {
      // Best effort: give up on this page when the language links cannot be read.
      // FIXME title is always null!
      logger.warn("no languages for page {} ", article.getTitle());
      return;
    }

    for (final de.tudarmstadt.ukp.wikipedia.parser.Link languageLink : page.getLanguages()) {
      if (!languageLink.getText().startsWith("en:")) {
        continue;
      }
      // Drop the first three characters of the target (the "en:" prefix).
      article.setEnWikiTitle(languageLink.getTarget().substring(3));
      break;
    }
  }

  /**
   * Stores the titles of the sections of the page on the article, skipping null sections and
   * sections without a title.
   */
  private void setSections(Article.Builder article, ParsedPage page) {
    final List<String> titles = new ArrayList<String>(10);
    for (final Section section : page.getSections()) {
      final boolean titled = (section != null) && (section.getTitle() != null);
      if (titled) {
        titles.add(section.getTitle());
      }
    }
    article.setSections(titles);
  }

  /**
   * Stores the templates of the page on the article.
   *
   * <p>A template whose name starts with "infobox" (ignoring case) becomes the infobox of the
   * article; all other templates are collected into the template list. The parameters of every
   * template are also passed to {@code parseTemplatesSchema}.
   *
   * @param article builder that receives the templates and the infobox
   * @param page parsed page to read the templates from
   */
  private void setTemplates(Article.Builder article, ParsedPage page) {
    final List<Template> templates = new ArrayList<Template>(10);

    for (final de.tudarmstadt.ukp.wikipedia.parser.Template t : page.getTemplates()) {
      final List<String> templateParameters = t.getParameters();
      parseTemplatesSchema(article, templateParameters);

      final String name = t.getName();
      // Locale.ROOT: under a Turkish default locale "INFOBOX".toLowerCase() yields a dotless
      // i and the prefix test would fail. java.util.Locale is fully qualified because this
      // package declares its own Locale class.
      if (name.toLowerCase(java.util.Locale.ROOT).startsWith("infobox")) {
        article.setInfobox(new Template(name, templateParameters));
      } else {
        templates.add(new Template(name, templateParameters));
      }
    }
    article.setTemplates(templates);
  }

  /**
   * Extracts the attribute names of a template and stores them on the article as its template
   * schema.
   *
   * <p>For every parameter that contains {@code '='}, the text before the first {@code '='} is
   * trimmed, lower-cased and added to the schema. Null parameters, parameters without {@code
   * '='} and parameters consisting only of {@code '='} are skipped.
   *
   * <p>NOTE(review): every call replaces the schema stored on the article, and {@code
   * setTemplates} calls this method once per template, so only the schema of the last template
   * is kept - confirm this is intended.
   *
   * @param article builder that receives the schema
   * @param templateParameters raw parameters of one template, typically {@code name=value}
   */
  private void parseTemplatesSchema(Article.Builder article, List<String> templateParameters) {
    final List<String> schema = new ArrayList<String>(10);

    for (final String parameter : templateParameters) {
      if (parameter == null) {
        continue;
      }
      final int separator = parameter.indexOf('=');
      if (separator < 0) {
        continue;
      }
      // A parameter made only of '=' carries no attribute name at all; skip it explicitly
      // instead of relying on an exception from an empty split() result.
      if (separator == 0 && StringUtils.containsOnly(parameter, '=')) {
        continue;
      }
      // Locale.ROOT keeps lower-casing independent of the JVM default locale; java.util.Locale
      // is fully qualified because this package declares its own Locale class.
      final String attributeName =
          parameter.substring(0, separator).trim().toLowerCase(java.util.Locale.ROOT);
      schema.add(attributeName);
    }
    article.setTemplatesSchema(schema);
  }

  /**
   * Converts every category link of the page into a {@link Link} of type {@link
   * LinkType#CATEGORY} (target as id, text as anchor, plus start and end position) and stores
   * the list on the article.
   */
  private void setCategories(Article.Builder article, ParsedPage page) {
    final List<Link> result = new ArrayList<>(10);
    for (final de.tudarmstadt.ukp.wikipedia.parser.Link parsed : page.getCategories()) {
      final Link.Builder category = Link.newBuilder();
      category.setId(parsed.getTarget());
      category.setAnchor(parsed.getText());
      category.setStart(parsed.getPos().getStart());
      category.setEnd(parsed.getPos().getEnd());
      category.setType(LinkType.CATEGORY);

      final Link built = category.build();
      result.add(built);
    }
    article.setCategories(result);
  }

  /**
   * Stores the bold and italic fragments of the page on the article. For every paragraph the
   * bold fragments are collected first, followed by the italic ones.
   */
  private void setHighlights(Article.Builder article, ParsedPage page) {
    final Content.FormatType[] emphasis = {Content.FormatType.BOLD, Content.FormatType.ITALIC};
    final List<String> fragments = new ArrayList<String>(20);

    for (final Paragraph paragraph : page.getParagraphs()) {
      final String text = paragraph.getText();
      for (final Content.FormatType format : emphasis) {
        for (final Span span : paragraph.getFormatSpans(format)) {
          // A span is resolved against the text of the paragraph it belongs to.
          fragments.add(span.getText(text));
        }
      }
    }
    article.setHighlights(fragments);
  }

  /** Stores the text of every non-empty paragraph of the page on the article, in page order. */
  private void setParagraphs(Article.Builder article, ParsedPage page) {
    final List<String> texts = new ArrayList<String>(page.nrOfParagraphs());
    for (final Paragraph paragraph : page.getParagraphs()) {
      final String body = paragraph.getText();
      if (body.isEmpty()) {
        continue;
      }
      texts.add(body);
    }
    article.setParagraphs(texts);
  }

  /**
   * Marks the article as {@link ArticleType#DISAMBIGUATION} when, for one of the disambiguation
   * identifiers of the locale, the title contains the identifier or a template of the article is
   * named like it (both ignoring case). The first matching identifier ends the search.
   */
  private void setDisambiguation(Article.Builder a) {
    for (final String marker : locale.getDisambigutionIdentifiers()) {
      boolean matches = StringUtils.containsIgnoreCase(a.getTitle(), marker);

      // Templates are only consulted when the title did not already match.
      if (!matches && a.hasTemplates()) {
        for (final Template template : a.getTemplates()) {
          if (StringUtils.equalsIgnoreCase(template.getName(), marker)) {
            matches = true;
            break;
          }
        }
      }

      if (matches) {
        a.setType(ArticleType.DISAMBIGUATION);
        return;
      }
    }
  }
}