org.xml.sax.InputSource Scala Examples

The following examples show how to use org.xml.sax.InputSource. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: PmmlLoaderKit.scala    From flink-jpmml   with GNU Affero General Public License v3.0 5 votes vote down vote up
package io.radicalbit.flink.pmml.scala.utils

import org.dmg.pmml.PMML
import org.jpmml.model.{ImportFilter, JAXBUtil}
import org.xml.sax.InputSource

trait PmmlLoaderKit {

  protected case object Source {
    val KmeansPmml = "/kmeans.xml"
    val KmeansPmml41 = "/kmeans41.xml"
    val KmeansPmml40 = "/kmeans40.xml"
    val KmeansPmml42 = "/kmeans42.xml"
    val KmeansPmml32 = "/kmeans41.xml"

    val KmeansPmmlEmpty = "/kmeans_empty.xml"
    val KmeansPmmlNoOut = "/kmeans_nooutput.xml"
    val KmeansPmmlStringFields = "/kmeans_stringfields.xml"
    val KmeansPmmlNoOutNoTrg = "/kmeans_nooutput_notarget.xml"
    val NotExistingPath: String = "/not/existing/" + scala.util.Random.nextString(4)
  }

  final protected def getPMMLSource(path: String): String =
    getClass.getResource(path).getPath

  final protected def getPMMLResource(path: String): PMML = {
    val source = scala.io.Source.fromURL(getClass.getResource(path)).reader()
    JAXBUtil.unmarshalPMML(ImportFilter.apply(new InputSource(source)))
  }

} 
Example 2
Source File: AmqpXPathCheckMaterializer.scala    From gatling-amqp-plugin   with Apache License 2.0 5 votes vote down vote up
package ru.tinkoff.gatling.amqp.checks

import java.io.{ByteArrayInputStream, InputStreamReader}

import io.gatling.commons.validation.{safely, _}
import io.gatling.core.check.xpath.{Dom, XPathCheckType, XmlParsers}
import io.gatling.core.check.{CheckMaterializer, Preparer}
import org.xml.sax.InputSource
import ru.tinkoff.gatling.amqp.AmqpCheck
import ru.tinkoff.gatling.amqp.request.AmqpProtocolMessage

class AmqpXPathCheckMaterializer(xmlParsers: XmlParsers)
    extends CheckMaterializer[XPathCheckType, AmqpCheck, AmqpProtocolMessage, Option[Dom]](identity) {
  private val ErrorMapper = "Could not parse response into a DOM Document: " + _

  override protected def preparer: Preparer[AmqpProtocolMessage, Option[Dom]] =
    message =>
      safely(ErrorMapper) {
        message match {
          case AmqpProtocolMessage(_, payload, _) =>
            val in = new ByteArrayInputStream(payload)
            Some(xmlParsers.parse(new InputSource(new InputStreamReader(in)))).success
          case _ => "Unsupported message type".failure
        }
      }
} 
Example 3
Source File: EventDrivenFetcher.scala    From Mycat-spider   with Apache License 2.0 5 votes vote down vote up
package turbo.crawler.power

import java.io.IOException
import java.io.StringReader
import java.net.BindException
import java.net.SocketException
import java.net.SocketTimeoutException

import org.apache.commons.httpclient.ConnectTimeoutException
import org.apache.commons.httpclient.Header
import org.cyberneko.html.parsers.DOMParser
import org.w3c.dom.Document
import org.xml.sax.InputSource

import turbo.crawler.FetchRejectedException
import turbo.crawler.Fetchable
import turbo.crawler.IO
import turbo.crawler.Logable
import turbo.crawler.ResourceHasAlreadyBeenFetchedException
import turbo.crawler.StringAdapter
import turbo.crawler.io.HttpReturns

/**
 * Event driven fetcher
 * @author mclaren
 *
 */
class EventDrivenFetcher[T <: Fetchable](eventId: String) extends Logable with MessageDriven with IO with StringAdapter {
  def fetch(fetchUrl: String  ,
    contentFilter: String => String  ,
    parseDocument: Document => List[T])(hasRejected: Document => Boolean)  = {
    val _retry = (msg: String) => {
      logger.info("Retry " + msg)
      Thread.sleep(3000)
      this.fetch(fetchUrl, contentFilter, parseDocument)(hasRejected)(howToContinue)(referer)
    }
    var httpReturns: HttpReturns = null
    try {
      val dom = new DOMParser
      httpReturns = this.fromUrl(fetchUrl, Array[Header](new Header("Referer", referer(fetchUrl))))
      dom.parse(new InputSource(new StringReader(contentFilter(httpReturns.body))))
      var document = dom.getDocument

      //检查是否被屏蔽
      if (hasRejected(document)) throw new FetchRejectedException(fetchUrl)

      parseDocument(document).foreach(x => fireEvent(new Evt(eventId + "_COMPLETION", x)))

    } catch {
      case e: SocketTimeoutException => _retry(e.getMessage)
      case e: SocketException => _retry(e.getMessage)
      case e: ConnectTimeoutException => _retry(e.getMessage)
      case e: IOException => {
        logger.info("Oh网络错误with代理:" + httpReturns.proxy.ip + ":" + httpReturns.proxy.port)
        howToContinue(fetchUrl, httpReturns.proxy)
        //10秒之内只允许出现一次重拨
        _retry(e.getMessage)
      }
      case e: BindException => _retry(e.getMessage)
      case e: FetchRejectedException => {
        logger.info("Oh 惨遭屏蔽~")
        howToContinue(e.getFetchUrl, httpReturns.proxy)
        //10秒之内只允许出现一次重拨
        _retry(e.getMessage)
      }
      case e: ResourceHasAlreadyBeenFetchedException =>
      case e: Exception => {
        logger.error("Unknown exception has been occurred", e)
      }
    }
  }
} 
Example 4
Source File: Pagination.scala    From Mycat-spider   with Apache License 2.0 5 votes vote down vote up
package turbo.crawler.power

import java.io.IOException
import java.io.StringReader
import java.net.SocketException
import java.net.SocketTimeoutException

import org.apache.commons.httpclient.ConnectTimeoutException
import org.apache.commons.httpclient.Header
import org.cyberneko.html.parsers.DOMParser
import org.w3c.dom.Document
import org.xml.sax.InputSource

import turbo.crawler.FetchRejectedException
import turbo.crawler.Logable
import turbo.crawler.ResourceHasAlreadyBeenFetchedException
import turbo.crawler.io.HttpReturns
import turbo.crawler.io.InternetIO

/**
 * 分页支持
 * @author mclaren
 *
 */
object pages extends Logable with InternetIO {
  def apply(fetchUrl: String, contentFilter: String => String, checkBoundary: Document => Int, urlFactory: (String, Int) => String)(hasRejected: Document => Boolean)(howToContinue: (String, turbo.crawler.io.Proxy) => Unit): List[String] = {
    var value = new ValueRef[Int](0)
    resetBoundary(fetchUrl, value, contentFilter, checkBoundary, urlFactory)(hasRejected)(howToContinue)
    var rts = List[String]()
    value.get
    for (i <- 1 to value.get) {
      rts = rts.+:(urlFactory(fetchUrl, i))
    }
    rts
  }

  private def resetBoundary(fetchUrl: String, lastPage: ValueRef[Int], contentFilter: String => String = x => x, checkBoundary: Document => Int, urlFactory: (String, Int) => String)(hasRejected: Document => Boolean  ): Unit = {
    val _retry = (() => {
      Thread.sleep(3000)
      resetBoundary(fetchUrl, lastPage, contentFilter, checkBoundary, urlFactory)(hasRejected)(howToContinue)
    })

    var httpReturns: HttpReturns = null
    try {
      var domp = new DOMParser
      httpReturns = this.fromUrl(fetchUrl, Array[Header]())
      domp.parse(new InputSource(new StringReader(contentFilter(httpReturns.body))))
      var document = domp.getDocument

      if (hasRejected(document)) throw new FetchRejectedException(fetchUrl, httpReturns.proxy)

      lastPage.set(checkBoundary(document))
    } catch {
      case e: SocketTimeoutException => _retry()
      case e: SocketException => _retry()
      case e: ConnectTimeoutException => _retry()
      case e: IOException => _retry()
      case e: FetchRejectedException => {
        logger.info("Oh 惨遭屏蔽~")
        howToContinue(e.getFetchUrl, httpReturns.proxy)
        _retry()
      }
      case e: ResourceHasAlreadyBeenFetchedException =>
      case e: Exception => {
        logger.error("Unknown exception has been occurred", e)
      }
    }
  }
}
class ValueRef[M](v: M) {
  var value = v
  def set(vv: M) = this.value = vv
  def get = value
} 
Example 5
Source File: PMMLUtils.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.util

import java.io.StringReader

import org.dmg.pmml._
import org.jpmml.model.{ImportFilter, JAXBUtil}
import org.xml.sax.InputSource

/**
 * Testing utils for working with PMML.
 * Predictive Model Markup Language (PMML) is an XML-based file format
 * developed by the Data Mining Group (www.dmg.org).
 */
object PMMLUtils {
  /**
   * :: Experimental ::
   * Load a PMML model from a string. Note: for testing only, PMML model evaluation is supported
   * through external spark-packages.
   */
  def loadFromString(input: String): PMML = {
    val is = new StringReader(input)
    val transformed = ImportFilter.apply(new InputSource(is))
    JAXBUtil.unmarshalPMML(transformed)
  }
}