org.elasticsearch.hadoop.mr.EsInputFormat Java Examples

The following examples show how to use org.elasticsearch.hadoop.mr.EsInputFormat. They are drawn from several open source projects; the originating project and source file are noted above each one.
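
Before the project-specific examples, here is a minimal, self-contained sketch of wiring EsInputFormat into a new-API (org.apache.hadoop.mapreduce) read job. The host, port, and the "artists" index are hypothetical placeholders rather than values taken from the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.elasticsearch.hadoop.mr.EsInputFormat;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;

public class EsReadSketch {
    public static Job newReadJob() throws IOException {
        Configuration conf = new Configuration();
        conf.set("es.nodes", "localhost");   // hypothetical Elasticsearch host(s)
        conf.set("es.port", "9200");         // hypothetical REST port
        conf.set("es.resource", "artists");  // hypothetical index to read from

        Job job = Job.getInstance(conf, "es-read-sketch");
        job.setInputFormatClass(EsInputFormat.class);
        // EsInputFormat emits the document id as a Text key and the
        // document body as a Writable map value.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LinkedMapWritable.class);
        job.setNumReduceTasks(0); // a plain read is map-only; no shuffle needed
        return job;
    }
}
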
Example #1
Source File: AbstractMRNewApiSearchTest.java    From elasticsearch-hadoop with Apache License 2.0
private Configuration createConf() throws IOException {
    Configuration conf = HdpBootstrap.hadoopConfig();
    HadoopCfgUtils.setGenericOptions(conf);
    Job job = new Job(conf);
    job.setInputFormatClass(EsInputFormat.class);
    job.setOutputFormatClass(PrintStreamOutputFormat.class);
    job.setOutputKeyClass(Text.class);

    boolean type = random.nextBoolean();
    Class<?> mapType = (type ? MapWritable.class : LinkedMapWritable.class);

    job.setOutputValueClass(mapType);
    conf.set(ConfigurationOptions.ES_QUERY, query);

    conf.set(ConfigurationOptions.ES_READ_METADATA, String.valueOf(readMetadata));
    conf.set(ConfigurationOptions.ES_OUTPUT_JSON, String.valueOf(readAsJson));

    new QueryTestParams(tempFolder).provisionQueries(conf);
    job.setNumReduceTasks(0);
    //PrintStreamOutputFormat.stream(conf, Stream.OUT);

    Configuration cfg = job.getConfiguration();
    HdpBootstrap.addProperties(cfg, TestSettings.TESTING_PROPS, false);
    return cfg;
}
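
The random choice between MapWritable and LinkedMapWritable above exercises both supported value types: es-hadoop's LinkedMapWritable is backed by a linked map and so preserves the field order of the deserialized documents, while Hadoop's own MapWritable makes no ordering guarantee.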
 
Example #2
Source File: AbstractExtraMRTests.java    From elasticsearch-hadoop with Apache License 2.0
private JobConf createReadJobConf() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(EsInputFormat.class);
    conf.setOutputFormat(PrintStreamOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    boolean type = random.nextBoolean();
    Class<?> mapType = (type ? MapWritable.class : LinkedMapWritable.class);
    conf.setOutputValueClass(mapType);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.setNumReduceTasks(0);

    conf.set(ConfigurationOptions.ES_READ_METADATA, String.valueOf(random.nextBoolean()));
    conf.set(ConfigurationOptions.ES_READ_METADATA_VERSION, String.valueOf(true));
    conf.set(ConfigurationOptions.ES_OUTPUT_JSON, "true");

    FileInputFormat.setInputPaths(conf, new Path(MRSuite.testData.gibberishDat(conf)));
    return conf;
}
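
Note that this example, like Example #3 below, uses the old org.apache.hadoop.mapred API and configures a JobConf directly, whereas Example #1 uses the new org.apache.hadoop.mapreduce API; EsInputFormat implements both interfaces, so the same class serves either style of job.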
 
Example #3
Source File: AbstractMROldApiSearchTest.java    From elasticsearch-hadoop with Apache License 2.0
private JobConf createJobConf() throws IOException {
    JobConf conf = HdpBootstrap.hadoopConfig();

    conf.setInputFormat(EsInputFormat.class);
    conf.setOutputFormat(PrintStreamOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    boolean type = random.nextBoolean();
    Class<?> mapType = (type ? MapWritable.class : LinkedMapWritable.class);
    conf.setOutputValueClass(mapType);
    HadoopCfgUtils.setGenericOptions(conf);
    conf.set(ConfigurationOptions.ES_QUERY, query);
    conf.setNumReduceTasks(0);

    conf.set(ConfigurationOptions.ES_READ_METADATA, String.valueOf(readMetadata));
    conf.set(ConfigurationOptions.ES_READ_METADATA_VERSION, String.valueOf(true));
    conf.set(ConfigurationOptions.ES_OUTPUT_JSON, String.valueOf(readAsJson));

    new QueryTestParams(tempFolder).provisionQueries(conf);
    FileInputFormat.setInputPaths(conf, new Path(MRSuite.testData.sampleArtistsDatUri()));

    HdpBootstrap.addProperties(conf, TestSettings.TESTING_PROPS, false);
    return conf;
}
 
Example #4
Source File: ComputeResponse.java    From incubator-retired-pirk with Apache License 2.0
/**
 * Method to read in the data from Elasticsearch, filter, and return an RDD of MapWritable data elements
 */
@SuppressWarnings("unchecked")
public JavaRDD<MapWritable> readDataES() throws IOException, PIRException
{
  logger.info("Reading data ");

  JavaRDD<MapWritable> jsonRDD;

  Job job = Job.getInstance();
  String jobName = "pirSpark_ES_" + esQuery + "_" + System.currentTimeMillis();
  job.setJobName(jobName);
  job.getConfiguration().set("es.nodes", SystemConfiguration.getProperty("es.nodes"));
  job.getConfiguration().set("es.port", SystemConfiguration.getProperty("es.port"));
  job.getConfiguration().set("es.resource", esResource);
  job.getConfiguration().set("es.query", esQuery);

  jsonRDD = sc.newAPIHadoopRDD(job.getConfiguration(), EsInputFormat.class, Text.class, MapWritable.class).values().coalesce(numDataPartitions);

  // Filter out by the provided stopListFile entries
  if (qSchema.getFilter() != null)
  {
    return jsonRDD.filter(new FilterData(accum, bVars));
  }
  else
  {
    logger.info("qSchema.getFilter() is null");
    return jsonRDD;
  }
}
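
As a follow-up, here is a hedged sketch of consuming the RDD this method returns, pulling a single field out of each MapWritable document; the helper class and field name are illustrative and not part of the Pirk code.

import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.spark.api.java.JavaRDD;

public final class EsRddSketch {
    // Extract one field from every document, yielding null where the field
    // is absent. The Text lookup key is built inside the lambda because
    // Text is not java.io.Serializable and must not be captured.
    public static JavaRDD<String> fieldValues(JavaRDD<MapWritable> docs, String field) {
        return docs.map(doc -> {
            Writable value = doc.get(new Text(field));
            return value == null ? null : value.toString();
        });
    }
}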
 
Example #5
Source File: HadoopFormatIOElasticTest.java    From beam with Apache License 2.0
/**
 * Set the Elasticsearch configuration parameters in the Hadoop configuration object.
 * Configuration object should have InputFormat class, key class and value class set. Mandatory
 * fields for ESInputFormat to be set are es.resource, es.nodes, es.port, es.internal.es.version.
 * Please refer to <a
 * href="https://www.elastic.co/guide/en/elasticsearch/hadoop/current/configuration.html"
 * >Elasticsearch Configuration</a> for more details.
 */
private Configuration getConfiguration() {
  Configuration conf = new Configuration();
  conf.set(ConfigurationOptions.ES_NODES, ELASTIC_IN_MEM_HOSTNAME);
  conf.set(ConfigurationOptions.ES_PORT, String.format("%s", port));
  conf.set(ConfigurationOptions.ES_RESOURCE, ELASTIC_RESOURCE);
  conf.set("es.internal.es.version", ELASTIC_INTERNAL_VERSION);
  conf.set(ConfigurationOptions.ES_NODES_DISCOVERY, TRUE);
  conf.set(ConfigurationOptions.ES_INDEX_AUTO_CREATE, TRUE);
  conf.setClass("mapreduce.job.inputformat.class", EsInputFormat.class, InputFormat.class);
  conf.setClass("key.class", Text.class, Object.class);
  conf.setClass("value.class", LinkedMapWritable.class, Object.class);
  return conf;
}
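
For context, a hedged sketch of how a configuration like the one above feeds Beam's HadoopFormatIO read transform; the pipeline wiring is illustrative rather than taken from the test.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.hadoop.format.HadoopFormatIO;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;

public class BeamEsReadSketch {
    // conf is assumed to be populated as in getConfiguration() above,
    // including the InputFormat, key, and value classes.
    public static PCollection<KV<Text, LinkedMapWritable>> readDocs(Pipeline p, Configuration conf) {
        return p.apply("ReadFromES",
                HadoopFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
    }
}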
 
Example #6
Source File: ESEntityExtractor.java    From deep-spark with Apache License 2.0
public ESEntityExtractor(Class<T> t) {
    super();
    this.deepJobConfig = new ESDeepJobConfig(t);
    this.inputFormat = new EsInputFormat<>();
    this.outputFormat = new EsOutputFormat();

}
 
Example #7
Source File: ReadFromES.java    From elasticsearch-hadoop with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf(), "ReadFromES");
    // DO NOT SET JAR BY CLASS HERE
    //
    // job.setJarByClass(getClass());

    EsMapReduceUtil.initCredentials(job);

    job.getConfiguration().set("es.output.json", "true");

    job.setInputFormatClass(EsInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    TextOutputFormat.setOutputPath(job, new Path(args[0]));

    job.setMapperClass(MapperImpl.class);
    // Secure Hadoop CANNOT perform shuffle phases without native libraries
    job.setNumReduceTasks(0);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    if (!job.waitForCompletion(true)) {
        return 1;
    }
    return 0;
}
 
Example #8
Source File: ComputeStreamingResponse.java    From incubator-retired-pirk with Apache License 2.0
/**
 * Method to read in the data from Elasticsearch, filter, and return an RDD of MapWritable data elements
 */
@SuppressWarnings("unchecked")
public JavaDStream<MapWritable> readDataES() throws IOException
{
  logger.info("Reading data ");

  Job job = Job.getInstance();
  String jobName = "pirSpark_ES_" + esQuery + "_" + System.currentTimeMillis();
  job.setJobName(jobName);
  job.getConfiguration().set("es.nodes", SystemConfiguration.getProperty("es.nodes"));
  job.getConfiguration().set("es.port", SystemConfiguration.getProperty("es.port"));
  job.getConfiguration().set("es.resource", esResource);
  job.getConfiguration().set("es.query", esQuery);

  // Read data from hdfs
  JavaDStream<MapWritable> mwStream;
  if (useQueueStream)
  {
    Queue<JavaRDD<MapWritable>> rddQueue = new LinkedList<>();
    JavaRDD<MapWritable> rddIn = jssc.sparkContext().newAPIHadoopRDD(job.getConfiguration(), EsInputFormat.class, Text.class, MapWritable.class).values()
        .coalesce(numDataPartitions);
    rddQueue.add(rddIn);

    mwStream = jssc.queueStream(rddQueue);
  }
  else
  {
    JavaPairInputDStream<Text,MapWritable> inputRDD = jssc.fileStream(inputData, Text.class, MapWritable.class, EsInputFormat.class);
    mwStream = inputRDD.transform(new Function<JavaPairRDD<Text,MapWritable>,JavaRDD<MapWritable>>()
    {
      private static final long serialVersionUID = 1L;

      @Override
      public JavaRDD<MapWritable> call(JavaPairRDD<Text,MapWritable> pair) throws Exception
      {
        return pair.values();
      }
    }).repartition(numDataPartitions);
  }

  // Filter out by the provided stopListFile entries
  if (qSchema.getFilter() != null)
  {
    return mwStream.filter(new FilterData(accum, bVars));
  }
  else
  {
    return mwStream;
  }
}
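
The two branches reflect a design choice in the streaming setup: with useQueueStream the data is read once through EsInputFormat into an RDD and replayed as a one-shot queue stream (convenient for testing), while the file-stream branch watches inputData for new files and maps each incoming pair RDD down to its values.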
 
Example #9
Source File: ESCellExtractor.java    From deep-spark with Apache License 2.0
public ESCellExtractor(Class<Cells> cellsClass) {
    super();
    this.deepJobConfig = new ESDeepJobConfig(cellsClass);
    this.inputFormat = new EsInputFormat<>();
    this.outputFormat = new EsOutputFormat();
}