package it.unimi.di.law.bubing.util;

/*
 * Copyright (C) 2013-2017 Paolo Boldi, Massimo Santini, and Sebastiano Vigna
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//RELEASE-STATUS: DIST

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;

import java.io.IOException;
import java.io.StringReader;
import java.net.URI;
import java.util.Random;
import java.util.Set;

import org.apache.http.HttpHost;
import org.apache.http.client.HttpClient;
import org.apache.http.client.RedirectException;
import org.junit.After;
import org.junit.Test;

/**
 * Tests for {@link URLRespectsRobots}: parsing of <code>robots.txt</code> content into an array
 * of disallowed path prefixes, and application of that array to URIs.
 *
 * <p>Some tests fetch the <code>robots.txt</code> file through a {@link SimpleFixedHttpProxy}
 * preloaded with canned responses and parse the resulting {@link FetchData}; the others feed
 * the parser directly from a {@link StringReader} or exercise the prefix-reduction code on
 * in-memory sets. Every {@link FetchData} instance created by a test is closed in a
 * <code>finally</code> block, so it is released even when an assertion fails.
 */
public class URLRespectsRobotsTest {

	/** The proxy used by the current test, or <code>null</code> if the test does not use one. */
	SimpleFixedHttpProxy proxy;


	/** Stops the proxy started by the test, if any. */
	@After
	public void tearDownProxy() throws InterruptedException, IOException {
		if (proxy != null) proxy.stopService();
	}

	/** Checks that <code>Disallow: /</code> for every user agent forbids all URLs of the host. */
	@Test
	public void testDisallowEverytingSync() throws Exception {
		proxy = new SimpleFixedHttpProxy();
		final URI robotsURL = URI.create("http://foo.bar/robots.txt");
		proxy.add200(robotsURL, "",
				"# go away\n" +
						"User-agent: *\n" +
						"Disallow: /\n"
				);
		final URI disallowedUri1 = URI.create("http://foo.bar/goo/zoo.html"); // Disallowed
		final URI disallowedUri2 = URI.create("http://foo.bar/gaa.html"); // Disallowed
		final URI disallowedUri3 = URI.create("http://foo.bar/"); // Disallowed
		proxy.start();

		final HttpClient httpClient = FetchDataTest.getHttpClient(new HttpHost("localhost", proxy.port()), false);

		final FetchData fetchData = new FetchData(Helpers.getTestConfiguration(this));
		try {
			fetchData.fetch(robotsURL, httpClient, null, null, true);
			final char[][] filter = URLRespectsRobots.parseRobotsResponse(fetchData, "any");
			assertFalse(URLRespectsRobots.apply(filter, disallowedUri1));
			assertFalse(URLRespectsRobots.apply(filter, disallowedUri2));
			assertFalse(URLRespectsRobots.apply(filter, disallowedUri3));
		}
		finally {
			fetchData.close();
		}
	}

	/**
	 * Checks agent-specific sections: an agent with an empty <code>Disallow</code> may fetch
	 * the URL, an agent with <code>Disallow: /</code> may not. The agent strings passed to the
	 * parser differ in case from those in the file and may carry a trailing token.
	 */
	@Test
	public void testAllowDisallowSync() throws Exception {
		proxy = new SimpleFixedHttpProxy();
		final URI robotsURL = URI.create("http://foo.bur/robots.txt");
		proxy.add200(robotsURL, "",
				"# goodguy can do anything\n" +
				"User-agent: goodguy\n" +
				"Disallow: \n\n" +
				"# badguy can do nothing\n" +
				"User-agent: badguy\n" +
				"Disallow: /\n"
		);

		final URI url = URI.create("http://foo.bur/goo/zoo.html"); // Allowed for goodguy, disallowed for badguy

		proxy.start();

		final HttpClient httpClient = FetchDataTest.getHttpClient(new HttpHost("localhost", proxy.port()), false);

		final FetchData fetchData = new FetchData(Helpers.getTestConfiguration(this));
		try {
			fetchData.fetch(robotsURL, httpClient, null, null, true);

			assertTrue(URLRespectsRobots.apply(URLRespectsRobots.parseRobotsResponse(fetchData, "goodGuy"), url));
			assertTrue(URLRespectsRobots.apply(URLRespectsRobots.parseRobotsResponse(fetchData, "goodGuy foo"), url));
			assertFalse(URLRespectsRobots.apply(URLRespectsRobots.parseRobotsResponse(fetchData, "badGuy"), url));
			assertFalse(URLRespectsRobots.apply(URLRespectsRobots.parseRobotsResponse(fetchData, "badGuy foo"), url));
		}
		finally {
			fetchData.close();
		}
	}

	/**
	 * Checks that an agent-specific section with an empty <code>Disallow</code> lets that agent
	 * through, while any other agent falls under the <code>*</code> section and is rejected.
	 */
	@Test
	public void testAllowOnlySync() throws Exception {
		proxy = new SimpleFixedHttpProxy();
		final URI robotsURL = URI.create("http://foo.bor/robots.txt");
		proxy.add200(robotsURL, "",
				"# goodguy can do anything\n" +
				"User-agent: goodguy\n" +
				"Disallow:\n\n" +
				"# every other guy can do nothing\n" +
				"User-agent: *\n" +
				"Disallow: /\n"
		);
		final URI url = URI.create("http://foo.bor/goo/zoo.html"); // Allowed for goodguy, disallowed for everybody else
		proxy.start();

		final HttpClient httpClient = FetchDataTest.getHttpClient(new HttpHost("localhost", proxy.port()), false);

		final FetchData fetchData = new FetchData(Helpers.getTestConfiguration(this));
		try {
			fetchData.fetch(robotsURL, httpClient, null, null, true);
			assertTrue(URLRespectsRobots.apply(URLRespectsRobots.parseRobotsResponse(fetchData, "goodGuy"), url));
			assertTrue(URLRespectsRobots.apply(URLRespectsRobots.parseRobotsResponse(fetchData, "goodGuy foo"), url));
			assertFalse(URLRespectsRobots.apply(URLRespectsRobots.parseRobotsResponse(fetchData, "badGuy"), url));
			assertFalse(URLRespectsRobots.apply(URLRespectsRobots.parseRobotsResponse(fetchData, "badGuy foo"), url));
		}
		finally {
			fetchData.close();
		}
	}

	/**
	 * Checks that a 404 response for <code>robots.txt</code> yields an empty filter, even
	 * though the response body contains robots rules.
	 */
	@Test
	public void test4xxSync() throws Exception {
		proxy = new SimpleFixedHttpProxy();
		final URI robotsURL = URI.create("http://foo.bor/robots.txt");
		proxy.addNon200(robotsURL, "HTTP/1.1 404 Not Found\n",
				"# goodguy can do anything\n" +
				"User-agent: goodguy\n" +
				"Disallow:\n\n" +
				"# every other guy can do nothing\n" +
				"User-agent: *\n" +
				"Disallow: /\n"
		);
		proxy.start();

		final HttpClient httpClient = FetchDataTest.getHttpClient(new HttpHost("localhost", proxy.port()), false);

		final FetchData fetchData = new FetchData(Helpers.getTestConfiguration(this));
		try {
			fetchData.fetch(robotsURL, httpClient, null, null, true);
			assertEquals(0, URLRespectsRobots.parseRobotsResponse(fetchData, "goodGuy").length);
			assertEquals(0, URLRespectsRobots.parseRobotsResponse(fetchData, "goodGuy foo").length);
		}
		finally {
			fetchData.close();
		}
	}


	/**
	 * Checks prefix matching against several <code>Disallow</code> lines listed in
	 * non-alphabetical order, probing paths that sort before, between and after the
	 * disallowed prefixes.
	 */
	@Test
	public void testComplexSync() throws Exception {
		proxy = new SimpleFixedHttpProxy();
		final URI robotsURL = URI.create("http://foo.bor/robots.txt");
		proxy.add200(robotsURL, "",
				"# every other guy can do nothing\n" +
				"User-agent: *\n" +
				"Disallow: /y\n" +
				"Disallow: /a\n" +
				"Disallow: /c/d\n" +
				"Disallow: /e\n"
		);
		proxy.start();

		final HttpClient httpClient = FetchDataTest.getHttpClient(new HttpHost("localhost", proxy.port()), false);

		final FetchData fetchData = new FetchData(Helpers.getTestConfiguration(this));
		try {
			fetchData.fetch(robotsURL, httpClient, null, null, true);
			final char[][] filter = URLRespectsRobots.parseRobotsResponse(fetchData, "goodGuy");
			assertTrue(URLRespectsRobots.apply(filter, URI.create("http://foo.bor/c")));
			assertTrue(URLRespectsRobots.apply(filter, URI.create("http://foo.bor/d")));
			assertFalse(URLRespectsRobots.apply(filter, URI.create("http://foo.bor/c/d")));
			assertTrue(URLRespectsRobots.apply(filter, URI.create("http://foo.bor/c/e")));
			assertTrue(URLRespectsRobots.apply(filter, URI.create("http://foo.bor/@")));
			assertTrue(URLRespectsRobots.apply(filter, URI.create("http://foo.bor/x")));
			assertTrue(URLRespectsRobots.apply(filter, URI.create("http://foo.bor/z")));
			assertFalse(URLRespectsRobots.apply(filter, URI.create("http://foo.bor/a")));
			assertFalse(URLRespectsRobots.apply(filter, URI.create("http://foo.bor/a/b")));
		}
		finally {
			fetchData.close();
		}
	}


	/**
	 * Checks redirect handling. First, <code>robots.txt</code> answers with a 301 pointing to
	 * another location, and the rules found there must be the ones applied. Second, a
	 * <code>robots.txt</code> that starts a chain of 301 responses must leave a
	 * {@link RedirectException} in {@link FetchData#exception}.
	 */
	@Test
	public void testRedirectSync() throws Exception {
		proxy = new SimpleFixedHttpProxy();
		final URI robotsURL0 = URI.create("http://foo.bar/robots.txt");
		final URI robotsURL1 = URI.create("http://foo.bar/fubar/robots.txt");

		proxy.addNon200(robotsURL0, "HTTP/1.1 301 Moved Permanently\nLocation: " + robotsURL1 + "\n", "");
		proxy.add200(robotsURL1, "",
				"# goodguy can do anything\n" +
				"User-agent: goodguy\n" +
				"Disallow:\n\n" +
				"# every other guy can do nothing\n" +
				"User-agent: *\n" +
				"Disallow: /\n"
		);
		final URI url = URI.create("http://foo.bar/goo/zoo.html"); // Allowed for goodguy, disallowed for everybody else
		proxy.add200(url, "", "Should not be crawled...");

		// A chain of redirects: /robots.txt -> /0 -> /1 -> ... -> /5 (the last target is not registered).
		proxy.addNon200(URI.create("http://too.many/robots.txt"), "HTTP/1.1 301 Moved Permanently\nLocation: http://too.many/0\n", "");
		for(int i = 0; i < 5; i++) proxy.addNon200(URI.create("http://too.many/" + i), "HTTP/1.1 301 Moved Permanently\nLocation: http://too.many/" + (i + 1) + "\n", "");

		proxy.start();

		final HttpClient httpClient = FetchDataTest.getHttpClient(new HttpHost("localhost", proxy.port()), true);

		final FetchData fetchData = new FetchData(Helpers.getTestConfiguration(this));
		try {
			fetchData.fetch(URI.create(BURL.schemeAndAuthority(url) + "/robots.txt"), httpClient, null, null, true);
			char[][] filter = URLRespectsRobots.parseRobotsResponse(fetchData, "goodGuy");
			assertTrue(URLRespectsRobots.apply(filter, url));
			filter = URLRespectsRobots.parseRobotsResponse(fetchData, "badGuy");
			assertFalse(URLRespectsRobots.apply(filter, url));

			filter = URLRespectsRobots.parseRobotsResponse(fetchData, "goodGuy foo");
			assertTrue(URLRespectsRobots.apply(filter, url));
			filter = URLRespectsRobots.parseRobotsResponse(fetchData, "badGuy foo");
			assertFalse(URLRespectsRobots.apply(filter, url));
		}
		finally {
			fetchData.close();
		}

		final FetchData tooManyFetchData = new FetchData(Helpers.getTestConfiguration(this));
		try {
			tooManyFetchData.fetch(URI.create("http://too.many/robots.txt"), httpClient, null, null, true);
			assertTrue(tooManyFetchData.exception instanceof RedirectException);
		}
		finally {
			tooManyFetchData.close();
		}
	}


	/**
	 * Checks that an agent not named in the file is governed by the <code>*</code> section,
	 * both when another agent's section follows and when <code>Disallow: /</code> is followed
	 * by a more specific <code>Disallow</code> line.
	 */
	@Test
	public void testLiebert() throws IOException {
		char[][] robots;

		robots = URLRespectsRobots.parseRobotsReader(new StringReader("User-agent: *\nDisallow: /\n\nUser-agent: Googlebot\nDisallow: /action\n"), "BUbiNG");
		assertFalse(URLRespectsRobots.apply(robots, URI.create("http://online.liebertpub.com/doi/abs/10.1089/dna.2012.1756")));

		robots = URLRespectsRobots.parseRobotsReader(new StringReader("User-agent: *\nDisallow: /\nDisallow: /action\n"), "BUbiNG");
		assertFalse(URLRespectsRobots.apply(robots, URI.create("http://online.liebertpub.com/doi/abs/10.1089/dna.2012.1756")));
	}

	/**
	 * Checks the exact parsed filter: only the <code>/</code> prefix of the <code>*</code>
	 * section is kept; the rule of the section for another agent is not.
	 */
	@Test
	public void testLiebert2() throws IOException {
		final char[][] robots = URLRespectsRobots.parseRobotsReader(new StringReader("User-agent: *\nDisallow: /\n\nUser-agent: Googlebot\nDisallow: /action\n"), "BUbiNG");
		assertEquals(1, robots.length);
		assertEquals("/", new String(robots[0]));
	}

	/**
	 * Checks filtering when some <code>Disallow</code> lines are duplicated or are extensions
	 * of other lines, and when a section for another agent follows the <code>*</code> section.
	 */
	@Test
	public void testPrefixes() throws IOException {
		char[][] robots;

		robots = URLRespectsRobots.parseRobotsReader(new StringReader("User-agent: *\nDisallow: /a\nDisallow: /a/a\nDisallow: /a/a\n"), "BUbiNG");
		assertTrue(URLRespectsRobots.apply(robots, URI.create("http://example.com/")));
		assertFalse(URLRespectsRobots.apply(robots, URI.create("http://example.com/a")));
		assertTrue(URLRespectsRobots.apply(robots, URI.create("http://example.com/b")));
		assertFalse(URLRespectsRobots.apply(robots, URI.create("http://example.com/a/b")));

		robots = URLRespectsRobots.parseRobotsReader(new StringReader("User-agent: *\nDisallow: /a/b\nDisallow: /a\nDisallow: /b/c\nDisallow: /b\n"), "BUbiNG");
		assertTrue(URLRespectsRobots.apply(robots, URI.create("http://example.com/c")));
		assertFalse(URLRespectsRobots.apply(robots, URI.create("http://example.com/a")));
		assertFalse(URLRespectsRobots.apply(robots, URI.create("http://example.com/b")));

		robots = URLRespectsRobots.parseRobotsReader(new StringReader("User-agent: *\nDisallow: /\nUser-agent: foo\nDisallow:\n"), "BUbiNG");
		assertArrayEquals(new char[][] { "/".toCharArray() }, robots);
		assertFalse(URLRespectsRobots.apply(robots, URI.create("http://example.com/")));
	}

	/**
	 * Pins the behavior for a <code>Disallow</code> pattern containing <code>*</code>: none of
	 * the probed URLs is rejected, including those whose path contains <code>buy_now</code>.
	 */
	@Test
	public void testDisallowStar() throws IOException{
		final char[][] robots = URLRespectsRobots.parseRobotsReader(new StringReader("User-agent: *\nDisallow: /*buy_now*\n"), "BUbiNG");
		assertTrue(URLRespectsRobots.apply(robots, URI.create("http://example.com/hi")));
		assertTrue(URLRespectsRobots.apply(robots, URI.create("http://example.com/hi_buy_now")));
		assertTrue(URLRespectsRobots.apply(robots, URI.create("http://example.com/buy_now_hi")));
		assertTrue(URLRespectsRobots.apply(robots, URI.create("http://example.com/buy_now/page")));
		assertTrue(URLRespectsRobots.apply(robots, URI.create("http://example.com/page/buy_now")));
	}


	/**
	 * Checks that a set containing the empty string reduces to the single empty prefix, and
	 * that the resulting filter rejects the root URL.
	 */
	@Test
	public void testEmptyString() {
		final char[][] robots = URLRespectsRobots.toSortedPrefixFreeCharArrays(new ObjectOpenHashSet<>(new String[] { "/", "" }));
		assertArrayEquals(new char[][] {{}}, robots);
		assertFalse(URLRespectsRobots.apply(robots, URI.create("http://example.com/")));
	}

	/**
	 * Checks {@link URLRespectsRobots#toSortedPrefixFreeCharArrays} on a pseudorandom set
	 * (fixed seed) of three-digit strings and six-digit extensions of them, comparing the
	 * result with an expected set built alongside the input.
	 */
	@Test
	public void testPrefixesDeep() {
		final Set<String> inset = new ObjectOpenHashSet<>(); // The input
		final Set<String> pfset = new ObjectOpenHashSet<>(); // The expected prefix-free output
		final Random rand = new Random(0);
		for (int i = 100; i < 999; i++) {
			if (rand.nextDouble() < 0.3) {
				final String commonPref = String.valueOf(i);
				final boolean putPrefix = rand.nextDouble() < 0.9;
				if (putPrefix) {
					pfset.add(commonPref);
					inset.add(commonPref);
				}
				for (int j = 100; j < 450; j++) {
					if (rand.nextDouble() < 0.3) {
						inset.add(commonPref + j);
						// An extension is expected in the output only if its three-digit prefix was not added.
						if (! putPrefix) pfset.add(commonPref + j);
					}
				}
			}
		}
		final char[][] resultArray = URLRespectsRobots.toSortedPrefixFreeCharArrays(inset);
		final Set<String> result = new ObjectOpenHashSet<>();
		for (char[] a: resultArray) result.add(new String(a));
		assertEquals(pfset, result);
	}

	/**
	 * Same as {@link #testDisallowEverytingSync()}, but the <code>robots.txt</code> content
	 * starts with a byte-order mark (U+FEFF), which must not prevent the rules from being
	 * recognized.
	 */
	@Test
	public void testDisallowEverytingWithUTFBOM() throws Exception {
		proxy = new SimpleFixedHttpProxy();
		final URI robotsURL = URI.create("http://foo.bar/robots.txt");
		proxy.add200(robotsURL, "",
				"\ufeff"+
						"User-agent: *\n" +
						"Disallow: /\n\n"
				);
		final URI disallowedUri1 = URI.create("http://foo.bar/goo/zoo.html"); // Disallowed
		final URI disallowedUri2 = URI.create("http://foo.bar/gaa.html"); // Disallowed
		final URI disallowedUri3 = URI.create("http://foo.bar/"); // Disallowed
		proxy.start();

		final HttpClient httpClient = FetchDataTest.getHttpClient(new HttpHost("localhost", proxy.port()), false);

		final FetchData fetchData = new FetchData(Helpers.getTestConfiguration(this));
		try {
			fetchData.fetch(robotsURL, httpClient, null, null, true);
			final char[][] filter = URLRespectsRobots.parseRobotsResponse(fetchData, "any");
			assertFalse(URLRespectsRobots.apply(filter, disallowedUri1));
			assertFalse(URLRespectsRobots.apply(filter, disallowedUri2));
			assertFalse(URLRespectsRobots.apply(filter, disallowedUri3));
		}
		finally {
			fetchData.close();
		}
	}

}