/*******************************************************************************
 * Copyright 2014 A3 lab (Dipartimento di Informatica, Università di Pisa)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package it.acubelab.tagme.preprocessing.support;

import java.io.IOException;
import java.util.regex.Pattern;

import it.acubelab.PLogger;
import it.acubelab.tagme.preprocessing.Dataset;
import it.acubelab.tagme.preprocessing.WikiPatterns;
import it.acubelab.tagme.preprocessing.WikiPatterns.Type;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.fastutil.objects.Object2IntMap;

/**
 * Dataset holding the set of page ids whose titles match one of the
 * language-specific "ignore" patterns.
 * <p>
 * A title qualifies when it matches either the {@link Type#PAGE_DATE} or the
 * {@link Type#PAGE_IGNORE} pattern supplied by {@link WikiPatterns} for the
 * dataset language. The ids are the values stored in {@link TitlesToWIDMap}.
 */
public class IgnoreWIDs extends Dataset<IntSet> {

	/**
	 * @param lang language code, forwarded unchanged to the parent dataset
	 */
	public IgnoreWIDs(String lang) {
		super(lang);
	}

	/**
	 * Scans every title of the title-to-id map and collects the ids of those
	 * matching the date pattern or, failing that, the generic ignore pattern.
	 *
	 * @return the trimmed set of ids to be ignored
	 * @throws IOException declared by the overridden method; none of the code
	 *         visible here throws it directly
	 */
	@Override
	protected IntSet parseSet() throws IOException
	{
		log.info("Loading data...");
		Object2IntMap<String> titleToWid = new TitlesToWIDMap(lang).getDataset();
		final int titleCount = titleToWid.size();

		Pattern datePattern = WikiPatterns.getPattern(lang, Type.PAGE_DATE);
		Pattern ignorePattern = WikiPatterns.getPattern(lang, Type.PAGE_IGNORE);

		// Sized for the worst case (every title ignored); trimmed before returning.
		IntOpenHashSet ignoredIds = new IntOpenHashSet(titleCount);

		// Progress counters: 0 = titles scanned, 1 = date pages, 2 = other ignored pages.
		PLogger progress = new PLogger(log,"titles","dates","others")
				.setEnd(0, titleCount)
				.start("Parsing ignore-pages...");

		for (String title : titleToWid.keySet())
		{
			progress.update(0);

			// The date pattern takes precedence; the generic pattern is only
			// consulted for titles that are not dates.
			final int counter;
			if (datePattern.matcher(title).find()) {
				counter = 1;
			} else if (ignorePattern.matcher(title).find()) {
				counter = 2;
			} else {
				continue;
			}

			progress.update(counter);
			ignoredIds.add(titleToWid.get(title));
		}
		progress.stop();

		ignoredIds.trim();
		return ignoredIds;
	}

}