Java Code Examples for org.apache.spark.SparkContext#newAPIHadoopRDD()

The following examples show how to use org.apache.spark.SparkContext#newAPIHadoopRDD(). Each example is taken from an open source project; the source file and license are noted above each snippet.
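Before the project-specific examples, here is a minimal, self-contained sketch of the basic call pattern. It is not taken from any of the projects below; the class name, application name, local master, and input path are placeholders chosen for illustration. The sketch reads plain text files through the new Hadoop API's TextInputFormat and wraps the result as a JavaPairRDD, the same wrapping used in the GeoWave examples.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.rdd.RDD;
import scala.Tuple2;

public class NewAPIHadoopRDDSketch {
    public static void main(String[] args) throws IOException {
        SparkContext sc = new SparkContext(
                new SparkConf().setAppName("newAPIHadoopRDD-sketch").setMaster("local[*]"));

        // A Hadoop Job is used only as a builder for the InputFormat configuration.
        Job job = Job.getInstance(sc.hadoopConfiguration(), sc.appName());
        FileInputFormat.addInputPath(job, new Path("hdfs:///tmp/input")); // placeholder path

        // newAPIHadoopRDD(conf, inputFormatClass, keyClass, valueClass) returns an RDD of
        // (key, value) tuples produced by the given new-API (org.apache.hadoop.mapreduce) InputFormat.
        RDD<Tuple2<LongWritable, Text>> rdd = sc.newAPIHadoopRDD(
                job.getConfiguration(), TextInputFormat.class, LongWritable.class, Text.class);

        // Wrap the Scala RDD as a JavaPairRDD for convenient use from Java.
        JavaPairRDD<LongWritable, Text> lines = JavaPairRDD.fromJavaRDD(rdd.toJavaRDD());
        System.out.println("Partitions: " + lines.getNumPartitions());

        sc.stop();
    }
}

Note that newAPIHadoopRDD() expects an InputFormat from the new org.apache.hadoop.mapreduce API; for the legacy org.apache.hadoop.mapred API, SparkContext#hadoopRDD() is the counterpart. All of the examples below follow the same pattern: configure the InputFormat through a Hadoop Job or Configuration, then pass that configuration together with the InputFormat, key, and value classes to newAPIHadoopRDD().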
Example 1
Source File: GraphXGraphGenerator.java    From rya with Apache License 2.0
public RDD<Tuple2<Object, RyaTypeWritable>> getVertexRDD(SparkContext sc, Configuration conf) throws IOException, AccumuloSecurityException {
    // Load configuration parameters
    zk = MRUtils.getACZK(conf);
    instance = MRUtils.getACInstance(conf);
    userName = MRUtils.getACUserName(conf);
    pwd = MRUtils.getACPwd(conf);
    mock = MRUtils.getACMock(conf, false);
    tablePrefix = MRUtils.getTablePrefix(conf);
    // Set authorizations if specified
    String authString = conf.get(MRUtils.AC_AUTH_PROP);
    if (authString != null && !authString.isEmpty()) {
        authorizations = new Authorizations(authString.split(","));
        conf.set(ConfigUtils.CLOUDBASE_AUTHS, authString); // for consistency
    }
    else {
        authorizations = AccumuloRdfConstants.ALL_AUTHORIZATIONS;
    }
    // Set table prefix to the default if not set
    if (tablePrefix == null) {
        tablePrefix = RdfCloudTripleStoreConstants.TBL_PRFX_DEF;
        MRUtils.setTablePrefix(conf, tablePrefix);
    }
    // Check for required configuration parameters
    Preconditions.checkNotNull(instance, "Accumulo instance name [" + MRUtils.AC_INSTANCE_PROP + "] not set.");
    Preconditions.checkNotNull(userName, "Accumulo username [" + MRUtils.AC_USERNAME_PROP + "] not set.");
    Preconditions.checkNotNull(pwd, "Accumulo password [" + MRUtils.AC_PWD_PROP + "] not set.");
    Preconditions.checkNotNull(tablePrefix, "Table prefix [" + MRUtils.TABLE_PREFIX_PROPERTY + "] not set.");
    RdfCloudTripleStoreConstants.prefixTables(tablePrefix);
    // If connecting to a real Accumulo instance, also record the ZooKeeper hosts for consistency
    if (!mock) {
        conf.set(ConfigUtils.CLOUDBASE_ZOOKEEPERS, zk);
    }
    // Ensure consistency between alternative configuration properties
    conf.set(ConfigUtils.CLOUDBASE_INSTANCE, instance);
    conf.set(ConfigUtils.CLOUDBASE_USER, userName);
    conf.set(ConfigUtils.CLOUDBASE_PASSWORD, pwd);
    conf.setBoolean(ConfigUtils.USE_MOCK_INSTANCE, mock);
    conf.set(RdfCloudTripleStoreConfiguration.CONF_TBL_PREFIX, tablePrefix);

    Job job = Job.getInstance(conf, sc.appName());

    ClientConfiguration clientConfig = new ClientConfiguration()
            .with(ClientProperty.INSTANCE_NAME, instance)
            .with(ClientProperty.INSTANCE_ZK_HOST, zk);

    GraphXInputFormat.setInputTableName(job, EntityCentricIndex.getTableName(conf));
    GraphXInputFormat.setConnectorInfo(job, userName, new PasswordToken(pwd));
    GraphXInputFormat.setZooKeeperInstance(job, clientConfig);
    GraphXInputFormat.setScanAuthorizations(job, authorizations);

    return sc.newAPIHadoopRDD(job.getConfiguration(), GraphXInputFormat.class, Object.class, RyaTypeWritable.class);
}
 
Example 2
Source File: GraphXGraphGenerator.java    From rya with Apache License 2.0
public RDD<Tuple2<Object, Edge>> getEdgeRDD(SparkContext sc, Configuration conf) throws IOException, AccumuloSecurityException {
    // Load configuration parameters
    zk = MRUtils.getACZK(conf);
    instance = MRUtils.getACInstance(conf);
    userName = MRUtils.getACUserName(conf);
    pwd = MRUtils.getACPwd(conf);
    mock = MRUtils.getACMock(conf, false);
    tablePrefix = MRUtils.getTablePrefix(conf);
    // Set authorizations if specified
    String authString = conf.get(MRUtils.AC_AUTH_PROP);
    if (authString != null && !authString.isEmpty()) {
        authorizations = new Authorizations(authString.split(","));
        conf.set(ConfigUtils.CLOUDBASE_AUTHS, authString); // for consistency
    }
    else {
        authorizations = AccumuloRdfConstants.ALL_AUTHORIZATIONS;
    }
    // Set table prefix to the default if not set
    if (tablePrefix == null) {
        tablePrefix = RdfCloudTripleStoreConstants.TBL_PRFX_DEF;
        MRUtils.setTablePrefix(conf, tablePrefix);
    }
    // Check for required configuration parameters
    Preconditions.checkNotNull(instance, "Accumulo instance name [" + MRUtils.AC_INSTANCE_PROP + "] not set.");
    Preconditions.checkNotNull(userName, "Accumulo username [" + MRUtils.AC_USERNAME_PROP + "] not set.");
    Preconditions.checkNotNull(pwd, "Accumulo password [" + MRUtils.AC_PWD_PROP + "] not set.");
    Preconditions.checkNotNull(tablePrefix, "Table prefix [" + MRUtils.TABLE_PREFIX_PROPERTY + "] not set.");
    RdfCloudTripleStoreConstants.prefixTables(tablePrefix);
    // If connecting to a real Accumulo instance, also record the ZooKeeper hosts for consistency
    if (!mock) {
        conf.set(ConfigUtils.CLOUDBASE_ZOOKEEPERS, zk);
    }
    // Ensure consistency between alternative configuration properties
    conf.set(ConfigUtils.CLOUDBASE_INSTANCE, instance);
    conf.set(ConfigUtils.CLOUDBASE_USER, userName);
    conf.set(ConfigUtils.CLOUDBASE_PASSWORD, pwd);
    conf.setBoolean(ConfigUtils.USE_MOCK_INSTANCE, mock);
    conf.set(RdfCloudTripleStoreConfiguration.CONF_TBL_PREFIX, tablePrefix);

    Job job = Job.getInstance(conf, sc.appName());

    ClientConfiguration clientConfig = new ClientConfiguration()
            .with(ClientProperty.INSTANCE_NAME, instance)
            .with(ClientProperty.INSTANCE_ZK_HOST, zk);

    RyaInputFormat.setTableLayout(job, TABLE_LAYOUT.SPO);
    RyaInputFormat.setConnectorInfo(job, userName, new PasswordToken(pwd));
    RyaInputFormat.setZooKeeperInstance(job, clientConfig);
    RyaInputFormat.setScanAuthorizations(job, authorizations);
    String tableName = RdfCloudTripleStoreUtils.layoutPrefixToTable(TABLE_LAYOUT.SPO, tablePrefix);
    InputFormatBase.setInputTableName(job, tableName);
    return sc.newAPIHadoopRDD(job.getConfiguration(), GraphXEdgeInputFormat.class, Object.class, Edge.class);
}
 
Example 3
Source File: GeoWaveRDDLoader.java    From geowave with Apache License 2.0
public static JavaPairRDD<GeoWaveInputKey, SimpleFeature> loadRawRDD(
    final SparkContext sc,
    final DataStorePluginOptions storeOptions,
    final RDDOptions rddOpts) throws IOException {
  if (sc == null) {
    LOGGER.error("Must supply a valid Spark Context. Please set SparkContext and try again.");
    return null;
  }

  if (storeOptions == null) {
    LOGGER.error("Must supply input store to load. Please set storeOptions and try again.");
    return null;
  }

  if (rddOpts == null) {
    LOGGER.error("Must supply valid RDDOptions to load a rdd.");
    return null;
  }

  final Configuration conf = new Configuration(sc.hadoopConfiguration());

  GeoWaveInputFormat.setStoreOptions(conf, storeOptions);

  if (rddOpts.getQuery() != null) {
    GeoWaveInputFormat.setQuery(
        conf,
        rddOpts.getQuery(),
        storeOptions.createAdapterStore(),
        storeOptions.createInternalAdapterStore(),
        storeOptions.createIndexStore());
  }

  if ((rddOpts.getMinSplits() > -1) || (rddOpts.getMaxSplits() > -1)) {
    GeoWaveInputFormat.setMinimumSplitCount(conf, rddOpts.getMinSplits());
    GeoWaveInputFormat.setMaximumSplitCount(conf, rddOpts.getMaxSplits());
  } else {
    final int defaultSplitsSpark = sc.getConf().getInt("spark.default.parallelism", -1);
    // Attempt to grab default partition count for spark and split data
    // along that.
    // Otherwise just fallback to default according to index strategy
    if (defaultSplitsSpark != -1) {
      GeoWaveInputFormat.setMinimumSplitCount(conf, defaultSplitsSpark);
      GeoWaveInputFormat.setMaximumSplitCount(conf, defaultSplitsSpark);
    }
  }

  final RDD<Tuple2<GeoWaveInputKey, SimpleFeature>> rdd =
      sc.newAPIHadoopRDD(
          conf,
          GeoWaveInputFormat.class,
          GeoWaveInputKey.class,
          SimpleFeature.class);

  final JavaPairRDD<GeoWaveInputKey, SimpleFeature> javaRdd =
      JavaPairRDD.fromJavaRDD(rdd.toJavaRDD());

  return javaRdd;
}
 
Example 4
Source File: GeoWaveRDDLoader.java    From geowave with Apache License 2.0
public static JavaPairRDD<GeoWaveInputKey, GridCoverage> loadRawRasterRDD(
    final SparkContext sc,
    final DataStorePluginOptions storeOptions,
    final String indexName,
    final Integer minSplits,
    final Integer maxSplits) throws IOException {
  if (sc == null) {
    LOGGER.error("Must supply a valid Spark Context. Please set SparkContext and try again.");
    return null;
  }

  if (storeOptions == null) {
    LOGGER.error("Must supply input store to load. Please set storeOptions and try again.");
    return null;
  }

  final Configuration conf = new Configuration(sc.hadoopConfiguration());

  GeoWaveInputFormat.setStoreOptions(conf, storeOptions);

  if (indexName != null) {
    GeoWaveInputFormat.setQuery(
        conf,
        QueryBuilder.newBuilder().indexName(indexName).build(),
        storeOptions.createAdapterStore(),
        storeOptions.createInternalAdapterStore(),
        storeOptions.createIndexStore());
  }
  if (((minSplits != null) && (minSplits > -1)) || ((maxSplits != null) && (maxSplits > -1))) {
    GeoWaveInputFormat.setMinimumSplitCount(conf, minSplits);
    GeoWaveInputFormat.setMaximumSplitCount(conf, maxSplits);
  } else {
    final int defaultSplitsSpark = sc.getConf().getInt("spark.default.parallelism", -1);
    // Attempt to grab default partition count for spark and split data
    // along that.
    // Otherwise just fallback to default according to index strategy
    if (defaultSplitsSpark != -1) {
      GeoWaveInputFormat.setMinimumSplitCount(conf, defaultSplitsSpark);
      GeoWaveInputFormat.setMaximumSplitCount(conf, defaultSplitsSpark);
    }
  }

  final RDD<Tuple2<GeoWaveInputKey, GridCoverage>> rdd =
      sc.newAPIHadoopRDD(
          conf,
          GeoWaveInputFormat.class,
          GeoWaveInputKey.class,
          GridCoverage.class);

  final JavaPairRDD<GeoWaveInputKey, GridCoverage> javaRdd =
      JavaPairRDD.fromJavaRDD(rdd.toJavaRDD());

  return javaRdd;
}