package it.unimi.di.law.bubing.parser;

/*
 * Copyright (C) 2004-2017 Paolo Boldi, Massimo Santini, and Sebastiano Vigna
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import it.unimi.di.law.bubing.parser.Parser.TextProcessor;
import it.unimi.dsi.big.util.StringMap;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.shorts.Short2ShortOpenHashMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.lang.MutableString;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;

import org.apache.commons.io.input.CharSequenceReader;

// RELEASE-STATUS: DIST

/** An implementation of a {@link Parser.TextProcessor} that accumulates the counts of terms from a given set specified via a
 * {@link StringMap}. */
public final class SpamTextProcessor implements TextProcessor<SpamTextProcessor.TermCount> {
	public final static class TermCount extends Short2ShortOpenHashMap {
		private static final long serialVersionUID = 1L;
	};

	private final FastBufferedReader fbr = new FastBufferedReader();
	private final TermCount termCount = new TermCount();
	private final Object2LongFunction<MutableString> termSetOnthology;

	public SpamTextProcessor(Object2LongFunction<MutableString> termSetOnthology) {
		this.termSetOnthology = termSetOnthology;
	}

	@SuppressWarnings("unchecked")
	public SpamTextProcessor(final String termSetOnthologyURI) throws ClassNotFoundException, MalformedURLException, IOException {
		 termSetOnthology = (Object2LongFunction<MutableString>)BinIO.loadObject(new URL(termSetOnthologyURI).openStream());
	}

	private void process() throws IOException {
		final MutableString word = new MutableString(), nonWord = new MutableString();
		while (fbr.next(word, nonWord)) {
			final short index = (short)termSetOnthology.getLong(word.toLowerCase());
			if (index != -1) {
				final short oldValue = termCount.get(index);
				if (oldValue < Short.MAX_VALUE) termCount.put(index, (short)(oldValue + 1));
			}
		}
	}

	@Override
	public Appendable append(CharSequence csq) throws IOException {
		fbr.setReader(new CharSequenceReader(csq));
		process();
		return this;
	}

	@Override
	public Appendable append(CharSequence csq, int start, int end) throws IOException {
		fbr.setReader(new CharSequenceReader(csq.subSequence(start, end)));
		process();
		return this;
	}

	@Override
	public Appendable append(char c) throws IOException {
		final short index = (short)termSetOnthology.getLong(new MutableString().append(Character.toLowerCase(c)));
		if (index != -1) {
			final short oldValue = termCount.get(index);
			if (oldValue < Short.MAX_VALUE) termCount.put(index, (short)(oldValue + 1));
		}

		return this;
	}

	@Override
	public void init(URI responseUrl) {
		termCount.clear();
	}

	@Override
	public TermCount result() {
		return termCount;
	}

	@Override
	public TextProcessor<TermCount> copy() {
		return new SpamTextProcessor(termSetOnthology);
	}
}