/* * Copyright (c) 2013, OpenCloudDB/MyCAT and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software;Designed and Developed mainly by many Chinese * opensource volunteers. you can redistribute it and/or modify it under the * terms of the GNU General Public License version 2 only, as published by the * Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Any questions about this component can be directed to it's project Web address * https://code.google.com/p/opencloudb/. 
* */
package turbo.crawler.power

import java.io.IOException
import java.io.StringReader
import java.net.BindException
import java.net.SocketException
import java.net.SocketTimeoutException
import org.apache.commons.httpclient.ConnectTimeoutException
import org.apache.commons.httpclient.Header
import org.cyberneko.html.parsers.DOMParser
import org.w3c.dom.Document
import org.xml.sax.InputSource
import turbo.crawler.FetchRejectedException
import turbo.crawler.Fetchable
import turbo.crawler.IO
import turbo.crawler.Logable
import turbo.crawler.ResourceHasAlreadyBeenFetchedException
import turbo.crawler.StringAdapter
import turbo.crawler.io.HttpReturns

/**
 * Event driven fetcher.
 *
 * Downloads a page, parses it into a DOM document and fires one
 * `eventId + "_COMPLETION"` event for every item extracted from that document.
 *
 * @author mclaren
 * @tparam T type of the items produced by the document parser
 * @param eventId prefix of the event id fired for each parsed item
 */
class EventDrivenFetcher[T <: Fetchable](eventId: String) extends Logable with MessageDriven with IO with StringAdapter {

  /**
   * Fetches `fetchUrl` and publishes the parsed items as events.
   *
   * A failed attempt is retried after a 3 second pause, without any upper bound on the
   * number of attempts, when it ends with a `SocketTimeoutException`, a `SocketException`,
   * a `ConnectTimeoutException`, any other `IOException`, or a `FetchRejectedException`.
   * A `ResourceHasAlreadyBeenFetchedException` ends the fetch silently; any other
   * `Exception` is logged and ends the fetch.
   *
   * @param fetchUrl      URL to download
   * @param contentFilter applied to the response body before it is parsed
   * @param parseDocument turns the parsed DOM document into the items to publish
   * @param hasRejected   returns true when the document shows that the request was rejected
   * @param howToContinue called with the URL and the proxy of the response before a retry
   *                      caused by a generic `IOException` or by a rejection
   * @param referer       computes the `Referer` header value from the URL (identity by default)
   */
  def fetch(fetchUrl: String /* URL */ , contentFilter: String => String /* content filter */ , parseDocument: Document => List[T])(hasRejected: Document => Boolean) /* test if it has been rejected */ (howToContinue: (String, turbo.crawler.io.Proxy) => Unit)(referer: String => String = x => x): Unit /* How to continue */ = {
    // Retrying is a loop rather than a recursive call to fetch, so a URL that keeps
    // failing cannot grow the call stack until it overflows.
    @scala.annotation.tailrec
    def loop(): Unit =
      fetchOnce(fetchUrl, contentFilter, parseDocument, hasRejected, howToContinue, referer) match {
        case Some(cause) =>
          logger.info("Retry " + cause.getMessage)
          Thread.sleep(3000)
          loop()
        case None => ()
      }

    loop()
  }

  /**
   * Performs a single fetch attempt.
   *
   * @return `Some(cause)` when the attempt failed in a way that should be retried,
   *         `None` when it succeeded or failed in a way that must not be retried
   */
  private def fetchOnce(
    fetchUrl: String,
    contentFilter: String => String,
    parseDocument: Document => List[T],
    hasRejected: Document => Boolean,
    howToContinue: (String, turbo.crawler.io.Proxy) => Unit,
    referer: String => String): Option[Exception] = {
    // Stays empty until the download has returned, so the handlers below can tell
    // "failed before any response was received" apart from "failed afterwards".
    var lastResponse: Option[HttpReturns] = None
    try {
      val dom = new DOMParser
      val httpReturns = this.fromUrl(fetchUrl, Array[Header](new Header("Referer", referer(fetchUrl))))
      lastResponse = Option(httpReturns)
      dom.parse(new InputSource(new StringReader(contentFilter(httpReturns.body))))
      val document = dom.getDocument

      // Check whether the request has been blocked.
      if (hasRejected(document)) throw new FetchRejectedException(fetchUrl)

      parseDocument(document).foreach(x => fireEvent(new Evt(eventId + "_COMPLETION", x)))
      None
    } catch {
      // java.net.BindException is a SocketException and is therefore handled by the
      // SocketException case as well.
      case e: SocketTimeoutException => Some(e)
      case e: SocketException => Some(e)
      case e: ConnectTimeoutException => Some(e)
      case e: IOException => {
        // The download itself may have thrown, in which case no response and no proxy exist.
        lastResponse.flatMap(r => Option(r.proxy)) match {
          case Some(proxy) =>
            logger.info("Oh网络错误with代理:" + proxy.ip + ":" + proxy.port)
            // Only one redial is allowed within 10 seconds.
            howToContinue(fetchUrl, proxy)
          case None =>
            logger.info("Oh网络错误(no proxy information available)")
        }
        Some(e)
      }
      case e: FetchRejectedException => {
        logger.info("Oh 惨遭屏蔽~")
        // Only one redial is allowed within 10 seconds.
        lastResponse.foreach(r => howToContinue(e.getFetchUrl, r.proxy))
        Some(e)
      }
      // Deliberately ignored: the resource was fetched before, nothing to do and no retry.
      case e: ResourceHasAlreadyBeenFetchedException => None
      case e: Exception => {
        logger.error("Unknown exception has been occurred", e)
        None
      }
    }
  }
}