package lucene4ir.indexer;

import lucene4ir.Lucene4IRConstants;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.jsoup.safety.Whitelist;

import java.io.BufferedReader;
import java.util.ArrayList;

/**
 * Created by leif on 03/09/2016.
 * Edited by Abdulaziz AlQattan on 16/03/2019.
 */
public class CommonCoreDocumentIndexer extends lucene4ir.indexer.DocumentIndexer {

    // Properties
    // jsoup whitelist; assigned in initWhiteList() and passed to Jsoup.clean() when a document is extracted.
    Whitelist whiteList;
    // The jsoup document currently being processed; queried by the getField*/getPubDate helpers.
    private org.jsoup.nodes.Document jdoc;
    // Lucene fields created by initFields().
    ArrayList<Field> fields;

    // Sub- Private Functions

    private String getFieldAttribute(String TagName  , String AttributeName )
        This Function is used to get the attribure value of a specific tag
        in a specific jsoup Document
        String selector =  TagName + "[" + AttributeName + "]";
        return ;
    private String getFieldText(String TagName){
           This function is used to retrieve gathered text from  all elements with a specific tag
           in the input jsoup document
        String fieldText = ""; dns = jdoc.getElementsByTag(TagName);
        if (dns.size() > 0)
            fieldText = dns.text();

        return fieldText;

    private boolean isNumeric(String strNum) {

        This Function is used to check whether an input string is numeric or not
        try {
            double d = Double.parseDouble(strNum);
        } catch (NumberFormatException | NullPointerException nfe) {
            return false;
        return true;

    private String getPubDate()

        This function is Used to get the Publication Date Supposing that :
        the value of content for and element with (Key = name and value = publication_day_of_month) = Day
        the value of content for and element with (Key = name and value = publication_month) = month
        the value of content for and element with (Key = name and value = publication_year) = year

        if any values of the return date is empty or non-Numeric return empty string

        String result = "" , aDay , aMonth , aYear;

        aDay = jdoc.getElementsByAttributeValue("name", "publication_day_of_month").attr("content");
        if (!isNumeric(aDay))
            return result;
        aMonth = jdoc.getElementsByAttributeValue("name", "publication_month").attr("content");
        if (!isNumeric(aMonth))
            return result;
        aYear = jdoc.getElementsByAttributeValue("name", "publication_year").attr("content");
        if (!isNumeric(aYear))
            return result;

        result = aDay + '/' + aMonth + "/" + aYear;
        return result;

    private String removeSpecialCharacters (String input)
        input = input.replace("%20" , " ");
        return input;

    private void addFieldToDocument(String fldName , char fldType)
        This Function is Used to Create a Field based on the input FieldName and FieldType
            if The Field Type = "s" then String Field
            Else ( TextField or TermVectorTextField According to the current indexPositions
            Then add the resultant field to the Document
        Field aField;
        if (fldType == 's')
            aField = new StringField(fldName, "", Field.Store.YES);
        else if (indexPositions)
            aField = new TermVectorEnabledTextField(fldName, "", Field.Store.YES);
            aField = new TextField(fldName, "", Field.Store.YES);

    private void initFields()
        Initialize Fields Collection in the Following Sequence :
        1= Document Number - String Field
        2- Publish Date - String Field
        3- Title - Text Field
        4- Content - Text Field
        5- All - Text Field
        fields = new ArrayList<Field>();
        addFieldToDocument(Lucene4IRConstants.FIELD_TITLE,' ');
        addFieldToDocument(Lucene4IRConstants.FIELD_CONTENT,' ');
        addFieldToDocument(Lucene4IRConstants.FIELD_ALL,' ');

    // Constructor Method
    public CommonCoreDocumentIndexer(String indexPath, String tokenFilterFile, boolean positional){
        super(indexPath, tokenFilterFile, positional);
        // Create the corpus fields and the jsoup whitelist once, up front. Both
        // initializers are private and have no other caller in this class, so without
        // these calls `fields` and `whiteList` would still be null when a document is
        // extracted.
        initFields();
        initWhiteList();
    }


    private void initWhiteList()
        // Add White List Of The Document
        // The White List Are The Tags to keep in the Document after Reading From Jsoup
        try {
            // Add The popular tags in the white list
            whiteList = Whitelist.relaxed();

            // Add each allowed tag with its allowed attributes

            // *******************

            whiteList.addAttributes("meta","name" , "content");

            // *******************


            // *******************

            whiteList.addAttributes("doc.copyright","year" , "holder");

            // *******************

            whiteList.addAttributes("doc.copyright","year" , "holder");

            // *******************



        } catch (Exception e){
            System.out.println(" caught a " + e.getClass() +
                    "\n with message: " + e.getMessage());

    public void indexDocumentsFromFile(String filename){

        This method is used to  :
            1- read an input file (filepath) Line by Line
            2- Gather all of these lines in one Text String
            3- add a Line Separator between Lines in the Resultant Text
            4- Send The Resultant Text To The Method extractFieldsFromXmlAndIndex
                to identify the xml tags and Index them according to the needs
        String line , XMLText = "";
        short lineNumber = 1;
        try {
            BufferedReader br = openDocumentFile(filename);
            try {
                line = br.readLine();
                while (line != null){
                    if (lineNumber > 3 && !line.startsWith("</nitf>"))
                        XMLText += line + System.lineSeparator();
                    line = br.readLine();

            } finally {
                if (!XMLText.isEmpty())
        catch (Exception e){
            System.out.println(" caught a " + e.getClass() +  e.getLocalizedMessage() +
                    "\n with message: " + e.getMessage());

       public void extractFieldsFromXmlAndIndex(String xmlString){
            String   docnum,
                     title ,
                     pubdate ="",
                     safeText = org.jsoup.Jsoup.clean(xmlString,whiteList),
                    values[] = new String[5];
           Document doc;
            jdoc = org.jsoup.Jsoup.parse(safeText);

            docnum = getFieldAttribute("doc-id" , "id-string").trim();
            docnum = removeSpecialCharacters(docnum);

            title = getFieldText("title");
            title = removeSpecialCharacters(title);
            pubdate = getPubDate();
            source = getFieldAttribute( "pubdata" , "name");
            source = removeSpecialCharacters(source);

            content = pubdate;
            content += getFieldAttribute("doc.copyright","year" ) + " ";
            content += getFieldAttribute("doc.copyright","holder") + " ";
            content += getFieldText( "classifier") + " ";
            content += source + " ";
            content  += getFieldText( "hl1") + " ";
            content += getFieldText( "p") + " ";

            String all = title + " " + content + " " + source + " " + pubdate;

          /*  The Fields Sequence
            1= Document Number - String Field
           2- Publish Date - String Field
           3- Title - Text Field
           4- Content - Text Field
           5- All - Text Field
            values[0] = docnum;
           values[1] = pubdate;
           values[2] = title;
           values[3] = content;
           values[4] = all;

            doc = new Document();
            for (short i = 0 ; i < values.length ; i++)
           System.out.println(String.format("Adding document: %s Title %s" , docnum , title));
           // Add the resultant document to the Indexer
