package it.unimi.di.law.bubing;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Modifier;
import java.net.InetAddress;
import java.net.URI;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.regex.Pattern;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.http.conn.DnsResolver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Charsets;
import com.google.common.collect.Iterators;
import com.google.common.primitives.Ints;

/*
 * Copyright (C) 2012-2017 Paolo Boldi, Massimo Santini, and Sebastiano Vigna
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import it.unimi.di.law.bubing.frontier.ParsingThread;
import it.unimi.di.law.bubing.parser.Parser;
import it.unimi.di.law.bubing.spam.SpamDetector;
import it.unimi.di.law.bubing.store.Store;
import it.unimi.di.law.bubing.util.BURL;
import it.unimi.di.law.bubing.util.Link;
import it.unimi.di.law.warc.filters.Filter;
import it.unimi.di.law.warc.filters.Filters;
import it.unimi.di.law.warc.filters.URIResponse;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.lang.FlyweightPrototype;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;

//RELEASE-STATUS: DIST

/** Global data shared by all threads.
 *
 * <p>All BUbiNG components must share a certain number of global variables, such
 * as filters and pools of objects. A single instance of this class is created
 * at agent construction time: it is used to pass around a single reference
 * to global data.
 *
 * <p>All fields in this class are either <code>final</code> or
 * <code>volatile</code>, depending on whether they can be modified at runtime
 * (usually by means of JMX methods in {@link Agent}).
 */
public class RuntimeConfiguration {
    private static final Logger LOGGER = LoggerFactory.getLogger(RuntimeConfiguration.class);

    /** The prefix marking a specification that names a file rather than a literal value. */
    private static final String FILE_PREFIX = "file:";

    /** Whether to fetch and use <code>robots.txt</code>. This value cannot be configured and it
     * requires recompilation from the sources.
     *
     * <p>You had better know what you are doing if you change this to false.
     */
    public static final boolean FETCH_ROBOTS = true;

    /** @see StartupConfiguration#name */
    public final String name;

    /** @see StartupConfiguration#group */
    public final String group;

    /** @see StartupConfiguration#weight */
    public final int weight;

    /** @see StartupConfiguration#maxUrlsPerSchemeAuthority */
    public final int maxUrlsPerSchemeAuthority;

    /** @see StartupConfiguration#fetchingThreads */
    public volatile int fetchingThreads;

    /** @see StartupConfiguration#parsingThreads */
    public volatile int parsingThreads;

    /** @see StartupConfiguration#dnsThreads */
    public volatile int dnsThreads;

    /** @see StartupConfiguration#fetchFilter */
    public volatile Filter<URI> fetchFilter;

    /** @see StartupConfiguration#scheduleFilter */
    public volatile Filter<Link> scheduleFilter;

    /** @see StartupConfiguration#parseFilter */
    public volatile Filter<URIResponse> parseFilter;

    /** @see StartupConfiguration#followFilter */
    public volatile Filter<URIResponse> followFilter;

    /** @see StartupConfiguration#storeFilter */
    public volatile Filter<URIResponse> storeFilter;

    /** @see StartupConfiguration#keepAliveTime */
    public volatile long keepAliveTime;

    /** @see StartupConfiguration#schemeAuthorityDelay */
    public volatile long schemeAuthorityDelay;

    /** @see StartupConfiguration#ipDelay */
    public volatile long ipDelay;

    /** @see StartupConfiguration#ipDelayFactor */
    public volatile double ipDelayFactor;

    /** @see StartupConfiguration#maxUrls */
    public volatile long maxUrls;

    /** @see StartupConfiguration#bloomFilterPrecision */
    public final double bloomFilterPrecision;

    /** An iterator returning URIs that are then used as a seed; this iterator <em>may</em> return {@code null} (when
     * invalid or relative URLs are specified).
     * @see StartupConfiguration#seed */
    public final Iterator<URI> seed;

    /** The set of IPv4 addresses (each packed into an integer) that should be blacklisted.
     * @see StartupConfiguration#blackListedIPv4Addresses */
    public final IntOpenHashSet blackListedIPv4Addresses;

    /** A lock used to access {@link #blackListedIPv4Addresses}. */
    public final ReadWriteLock blackListedIPv4Lock;

    /** The set of hashes of hosts that should be blacklisted.
     * @see StartupConfiguration#blackListedHosts */
    public final IntOpenHashSet blackListedHostHashes;

    /** A lock used to access {@link #blackListedHostHashes}. */
    public final ReadWriteLock blackListedHostHashesLock;

    /** @see StartupConfiguration#socketTimeout */
    public volatile int socketTimeout;

    /** @see StartupConfiguration#connectionTimeout */
    public volatile int connectionTimeout;

    /** @see StartupConfiguration#fetchDataBufferByteSize */
    public final int fetchDataBufferByteSize;

    /** @see StartupConfiguration#proxyHost */
    public final String proxyHost;

    /** @see StartupConfiguration#proxyPort */
    public final int proxyPort;

    /** @see StartupConfiguration#cookiePolicy */
    public final String cookiePolicy;

    /** @see StartupConfiguration#cookieMaxByteSize */
    public final int cookieMaxByteSize;

    /** @see StartupConfiguration#userAgent */
    public final String userAgent;

    /** @see StartupConfiguration#userAgentFrom */
    public final String userAgentFrom;

    /** @see StartupConfiguration#robotsExpiration */
    public volatile long robotsExpiration;

    /** @see StartupConfiguration#acceptAllCertificates */
    public volatile boolean acceptAllCertificates;

    /** @see StartupConfiguration#rootDir */
    public final File rootDir;

    /** @see StartupConfiguration#storeDir */
    public final File storeDir;

    /** @see StartupConfiguration#responseCacheDir */
    public final File responseCacheDir;

    /** @see StartupConfiguration#sieveDir */
    public final File sieveDir;

    /** @see StartupConfiguration#frontierDir */
    public final File frontierDir;

    /** @see StartupConfiguration#responseBodyMaxByteSize */
    public volatile int responseBodyMaxByteSize;

    /** @see StartupConfiguration#digestAlgorithm */
    public final String digestAlgorithm;

    /** @see StartupConfiguration#startPaused */
    public final boolean startPaused;

    /** @see StartupConfiguration#storeClass */
    public final Class<? extends Store> storeClass;

    /** @see StartupConfiguration#workbenchMaxByteSize */
    public volatile long workbenchMaxByteSize;

    /** @see StartupConfiguration#virtualizerMaxByteSize */
    public final long virtualizerMaxByteSize;

    /** @see StartupConfiguration#urlCacheMaxByteSize */
    public volatile long urlCacheMaxByteSize;

    /** @see StartupConfiguration#sieveSize */
    public final int sieveSize;

    /** @see StartupConfiguration#sieveStoreIOBufferByteSize */
    public final int sieveStoreIOBufferByteSize;

    /** @see StartupConfiguration#sieveAuxFileIOBufferByteSize */
    public final int sieveAuxFileIOBufferByteSize;

    /** @see StartupConfiguration#dnsCacheMaxSize */
    public final int dnsCacheMaxSize;

    /** @see StartupConfiguration#dnsPositiveTtl */
    public final long dnsPositiveTtl;

    /** @see StartupConfiguration#dnsNegativeTtl */
    public final long dnsNegativeTtl;

    /** @see StartupConfiguration#crawlIsNew */
    public final boolean crawlIsNew;

    /** The spam detector, or {@code null} if no spam-detector URI was specified.
     * @see StartupConfiguration#spamDetectorUri */
    public final SpamDetector<?> spamDetector;

    /** @see StartupConfiguration#spamDetectionThreshold */
    public final int spamDetectionThreshold;

    /** @see StartupConfiguration#spamDetectionPeriodicity */
    public final int spamDetectionPeriodicity;

    /** The parsers, instantiated. Parsers used by {@link ParsingThread} instances are obtained by {@linkplain FlyweightPrototype#copy() copying these parsers}. */
    public final ArrayList<Parser<?>> parsers;

    /* Global data not depending on a StartupConfiguration. */

    /* Global data not initialised at startup. */

    /** Whether the crawler is currently paused. When this variable changes to false,
     * a <code>notifyAll()</code> is issued on this runtime configuration. */
    public volatile boolean paused;

    /** Whether the crawler is currently being stopped. The change of state to true is stable&mdash;this
     * variable will never become false again. */
    public volatile boolean stopping;

    /** The DNS resolver used throughout the crawler.
     * @see StartupConfiguration#dnsResolverClass */
    public final DnsResolver dnsResolver;

    /** A pattern used to identify hosts specified directly via their address in dotted notation. Note the dot at the end.
     * It covers both IPv6 addresses (where hexadecimal notation is accepted by default) and IPv4 addresses (where hexadecimal notation
     * requires the 0x prefix on every single piece of the address). */
    public static final Pattern DOTTED_ADDRESS = Pattern.compile("(([0-9A-Fa-f]+[:])*[0-9A-Fa-f]+)|((((0x[0-9A-Fa-f]+)|([0-9]+))\\.)*((0x[0-9A-Fa-f]+)|([0-9]+)))");

    /** Parses a seed URL, logging an error if it is malformed or relative.
     *
     * @param s the string form of the seed URL.
     * @return the parsed absolute URI, or {@code null} if {@code s} cannot be parsed or is not absolute.
     */
    private static URI handleSeedURL(final MutableString s) {
        final URI url = BURL.parse(s);
        if (url == null) {
            LOGGER.error("The seed URL {} is malformed", s);
            return null;
        }
        if (!url.isAbsolute()) {
            LOGGER.error("The seed URL {} is relative", s);
            return null;
        }
        return url;
    }

    /** Closes a stream, logging (rather than propagating) any failure.
     *
     * @param stream the stream to be closed.
     * @param description a description of the stream used in the log message.
     */
    private static void closeQuietly(final InputStream stream, final String description) {
        try {
            stream.close();
        }
        catch (final IOException e) {
            LOGGER.warn("Could not close " + description, e);
        }
    }

    /** Converts a string specifying an IPv4 address into an integer. The string can be either a single integer (representing
     * the address) or a dot-separated 4-tuple of bytes.
     *
     * @param s the string to be converted.
     * @return the integer representing the IP address specified by s.
     * @throws ConfigurationException if {@code s} does not match {@link #DOTTED_ADDRESS}, cannot be resolved,
     * or resolves to an address longer than four bytes.
     */
    private int handleIPv4(final String s) throws ConfigurationException {
        try {
            if (!RuntimeConfiguration.DOTTED_ADDRESS.matcher(s).matches()) throw new ConfigurationException("Malformed IPv4 " + s + " for blacklisting");
            // Note that since we're sure this is a dotted-notation address, we pass directly through InetAddress.
            final byte[] address = InetAddress.getByName(s).getAddress();
            // DOTTED_ADDRESS also accepts colon-separated (IPv6) forms: reject them here.
            if (address.length > 4) throw new UnknownHostException("Not IPv4");
            return Ints.fromByteArray(address);
        }
        catch (final UnknownHostException e) {
            throw new ConfigurationException("Malformed IPv4 " + s + " for blacklisting", e);
        }
    }

    /** Adds a (or a set of) new IPv4 to the black list; the IPv4 can be specified directly or it can be a file (prefixed by
     * <code>file:</code>) containing one address per line (empty lines are skipped).
     *
     * <p>This method does not acquire {@link #blackListedIPv4Lock}.
     *
     * @param spec the specification (an IP address, or a file prefixed by <code>file:</code>); empty specifications are ignored.
     * @throws ConfigurationException if an address is malformed.
     * @throws FileNotFoundException if the specified file cannot be opened.
     */
    public void addBlackListedIPv4(final String spec) throws ConfigurationException, FileNotFoundException {
        if (spec.length() == 0) return; // Skip empty specs
        if (!spec.startsWith(FILE_PREFIX)) {
            blackListedIPv4Addresses.add(handleIPv4(spec));
            return;
        }

        final String fileName = spec.substring(FILE_PREFIX.length());
        final FileInputStream stream = new FileInputStream(fileName);
        try {
            final LineIterator lineIterator = new LineIterator(new FastBufferedReader(new InputStreamReader(stream, Charsets.ISO_8859_1)));
            while (lineIterator.hasNext()) {
                final MutableString line = lineIterator.next();
                if (line.length() > 0) blackListedIPv4Addresses.add(handleIPv4(line.toString()));
            }
        }
        finally {
            // Make sure the file is released even when a malformed entry aborts the loop.
            closeQuietly(stream, fileName);
        }
    }

    /** Adds a (or a set of) new host to the black list; the host can be specified directly or it can be a file (prefixed by
     * <code>file:</code>) containing one host per line (blank lines are skipped).
     *
     * <p>Hosts are trimmed and stored by means of their {@linkplain String#hashCode() hash code}.
     * This method does not acquire {@link #blackListedHostHashesLock}.
     *
     * @param spec the specification (a host, or a file prefixed by <code>file:</code>); empty specifications are ignored.
     * @throws ConfigurationException declared for interface compatibility.
     * @throws FileNotFoundException if the specified file cannot be opened.
     */
    public void addBlackListedHost(final String spec) throws ConfigurationException, FileNotFoundException {
        if (spec.length() == 0) return; // Skip empty specs
        if (!spec.startsWith(FILE_PREFIX)) {
            blackListedHostHashes.add(spec.trim().hashCode());
            return;
        }

        final String fileName = spec.substring(FILE_PREFIX.length());
        final FileInputStream stream = new FileInputStream(fileName);
        try {
            final LineIterator lineIterator = new LineIterator(new FastBufferedReader(new InputStreamReader(stream, Charsets.ISO_8859_1)));
            while (lineIterator.hasNext()) {
                final String host = lineIterator.next().toString().trim();
                // A blank line would otherwise blacklist the hash of the empty string (0).
                if (host.length() > 0) blackListedHostHashes.add(host.hashCode());
            }
        }
        finally {
            closeQuietly(stream, fileName);
        }
    }

    /** Creates a runtime configuration copying (and, where necessary, instantiating) the data
     * specified by a startup configuration.
     *
     * @param startupConfiguration the startup configuration.
     * @throws ConfigurationException if some configuration value is invalid, if some configured object
     * cannot be instantiated, or if there is no sieve and the follow filter is not {@link Filters#FALSE}.
     * @throws IOException if a file or URL named in the configuration cannot be read.
     */
    public RuntimeConfiguration(final StartupConfiguration startupConfiguration) throws ConfigurationException, IOException {
        try {
            crawlIsNew = startupConfiguration.crawlIsNew;
            name = startupConfiguration.name;
            group = startupConfiguration.group;
            weight = startupConfiguration.weight;
            maxUrlsPerSchemeAuthority = startupConfiguration.maxUrlsPerSchemeAuthority;
            fetchingThreads = startupConfiguration.fetchingThreads;
            parsingThreads = startupConfiguration.parsingThreads;
            dnsThreads = startupConfiguration.dnsThreads;
            fetchFilter = startupConfiguration.fetchFilter;
            scheduleFilter = startupConfiguration.scheduleFilter;
            parseFilter = startupConfiguration.parseFilter;
            followFilter = startupConfiguration.followFilter;
            storeFilter = startupConfiguration.storeFilter;
            keepAliveTime = startupConfiguration.keepAliveTime;
            schemeAuthorityDelay = startupConfiguration.schemeAuthorityDelay;
            ipDelay = startupConfiguration.ipDelay;
            ipDelayFactor = startupConfiguration.ipDelayFactor;
            maxUrls = startupConfiguration.maxUrls;
            bloomFilterPrecision = startupConfiguration.bloomFilterPrecision;
            startPaused = startupConfiguration.startPaused;
            storeClass = startupConfiguration.storeClass;
            workbenchMaxByteSize = startupConfiguration.workbenchMaxByteSize;
            virtualizerMaxByteSize = startupConfiguration.virtualizerMaxByteSize;
            urlCacheMaxByteSize = startupConfiguration.urlCacheMaxByteSize;
            // Masking with (-1 << 3) clears the three lowest bits, rounding down to a multiple of 8.
            sieveSize = startupConfiguration.sieveSize & (-1 << 3);
            sieveStoreIOBufferByteSize = startupConfiguration.sieveStoreIOBufferByteSize & (-1 << 3);
            sieveAuxFileIOBufferByteSize = startupConfiguration.sieveAuxFileIOBufferByteSize & (-1 << 3);
            dnsCacheMaxSize = startupConfiguration.dnsCacheMaxSize;
            dnsPositiveTtl = startupConfiguration.dnsPositiveTtl;
            dnsNegativeTtl = startupConfiguration.dnsNegativeTtl;

            try {
                dnsResolver = startupConfiguration.dnsResolverClass.getConstructor().newInstance();
            }
            catch (final Exception e) {
                throw new ConfigurationException(e.getMessage(), e);
            }

            if (startupConfiguration.spamDetectorUri.length() > 0) {
                // The stream is closed even if deserialisation fails.
                try (InputStream spamDetectorStream = new URL(startupConfiguration.spamDetectorUri).openStream()) {
                    spamDetector = (SpamDetector<?>)BinIO.loadObject(spamDetectorStream);
                }
            }
            else spamDetector = null;
            spamDetectionThreshold = startupConfiguration.spamDetectionThreshold;
            spamDetectionPeriodicity = startupConfiguration.spamDetectionPeriodicity;

            final List<Iterator<URI>> seedSequence = new ArrayList<>();
            for (final String spec : startupConfiguration.seed) {
                if (spec.length() == 0) continue; // Skip empty lines
                if (spec.startsWith(FILE_PREFIX)) {
                    // Seed files are read lazily, as the seed iterator is consumed.
                    final LineIterator lineIterator = new LineIterator(new FastBufferedReader(new InputStreamReader(new FileInputStream(spec.substring(FILE_PREFIX.length())), Charsets.ISO_8859_1)));
                    seedSequence.add(new Iterator<URI>() {
                        @Override
                        public boolean hasNext() {
                            return lineIterator.hasNext();
                        }

                        @Override
                        public URI next() {
                            return handleSeedURL(lineIterator.next());
                        }

                        @Override
                        public void remove() {
                            throw new UnsupportedOperationException();
                        }
                    });
                }
                else seedSequence.add(Iterators.singletonIterator(handleSeedURL(new MutableString(spec))));
            }

            blackListedIPv4Addresses = new IntOpenHashSet();
            for (final String spec : startupConfiguration.blackListedIPv4Addresses) addBlackListedIPv4(spec);
            blackListedIPv4Lock = new ReentrantReadWriteLock();

            blackListedHostHashes = new IntOpenHashSet();
            for (final String spec : startupConfiguration.blackListedHosts) addBlackListedHost(spec);
            blackListedHostHashesLock = new ReentrantReadWriteLock();

            this.seed = Iterators.concat(seedSequence.iterator());
            socketTimeout = startupConfiguration.socketTimeout;
            connectionTimeout = startupConfiguration.connectionTimeout;
            rootDir = new File(startupConfiguration.rootDir);
            storeDir = StartupConfiguration.subDir(startupConfiguration.rootDir, startupConfiguration.storeDir);
            responseCacheDir = StartupConfiguration.subDir(startupConfiguration.rootDir, startupConfiguration.responseCacheDir);
            sieveDir = StartupConfiguration.subDir(startupConfiguration.rootDir, startupConfiguration.sieveDir);
            frontierDir = StartupConfiguration.subDir(startupConfiguration.rootDir, startupConfiguration.frontierDir);
            fetchDataBufferByteSize = startupConfiguration.fetchDataBufferByteSize;
            proxyHost = startupConfiguration.proxyHost;
            proxyPort = startupConfiguration.proxyPort;
            cookiePolicy = startupConfiguration.cookiePolicy;
            cookieMaxByteSize = startupConfiguration.cookieMaxByteSize;
            userAgent = startupConfiguration.userAgent;
            userAgentFrom = startupConfiguration.userAgentFrom;
            robotsExpiration = startupConfiguration.robotsExpiration;
            acceptAllCertificates = startupConfiguration.acceptAllCertificates;
            responseBodyMaxByteSize = startupConfiguration.responseBodyMaxByteSize;
            digestAlgorithm = startupConfiguration.digestAlgorithm;
            parsers = parsersFromSpecs(startupConfiguration.parserSpec); // Try to build parsers just to see if the specs are correct

            // State setup
            paused = startPaused;
        }
        catch (final IllegalArgumentException | ClassNotFoundException | IllegalAccessException | InvocationTargetException | InstantiationException | NoSuchMethodException e) {
            throw new ConfigurationException(e);
        }

        if (sieveSize == 0 && followFilter != Filters.FALSE) throw new ConfigurationException("Without a sieve you must specify a FALSE follow filter");
    }

    /** Returns immediately if the crawler is not {@linkplain #paused paused}; otherwise,
     * waits on this runtime configuration until {@link #paused} becomes false.
     *
     * @throws InterruptedException if the calling thread is interrupted while waiting.
     */
    public void ensureNotPaused() throws InterruptedException {
        if (!paused) return; // Fast path: no need to take the monitor.
        boolean waited = false;
        synchronized (this) {
            // Loop to guard against wake-ups that happen while we are still paused.
            while (paused) {
                LOGGER.info("Detected pause--going to wait...");
                waited = true;
                wait();
            }
            if (waited) LOGGER.info("Pause terminated.");
        }
    }

    /** Returns a string listing, sorted by name, all non-static fields declared by this class
     * (with the exception of locks) together with their current values.
     *
     * @return a string representation of this runtime configuration.
     */
    @Override
    public String toString() {
        final Class<?> thisClass = getClass();
        final TreeMap<String,Object> values = new TreeMap<>();
        for (final Field f : thisClass.getDeclaredFields()) {
            // We must test the declared type of the field: f.getClass() is always Field.class.
            if (ReadWriteLock.class.isAssignableFrom(f.getType())) continue;
            if (Modifier.isStatic(f.getModifiers())) continue;
            try {
                values.put(f.getName(), f.get(this));
            }
            catch (final IllegalAccessException e) {
                values.put(f.getName(), "<THIS SHOULD NOT HAPPEN>");
            }
        }
        return values.toString();
    }

    /** Given an array of parser specifications, it returns the corresponding list of parsers.
     * Any exception raised while handling a specification is propagated to the caller.
     *
     * @param specs the parser specifications (they will be parsed using {@link ObjectParser});
     * unqualified class names are resolved against the package <code>it.unimi.di.law.bubing.parser</code>.
     * @return a list of parsers built according to the specifications, in the same order.
     */
    public static ArrayList<Parser<?>> parsersFromSpecs(String[] specs) throws IllegalArgumentException, ClassNotFoundException, IllegalAccessException, InvocationTargetException, InstantiationException, NoSuchMethodException, IOException {
        final ArrayList<Parser<?>> parsers = new ArrayList<>();
        for (final String spec : specs) parsers.add(ObjectParser.fromSpec(spec, Parser.class, new String[] { "it.unimi.di.law.bubing.parser" }));
        return parsers;
    }
}