org.apache.spark.Partition Java Examples

The following examples show how to use org.apache.spark.Partition. They are drawn from open-source projects; the source file, project, and license are noted above each snippet.
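Before the examples, it helps to keep the shape of the interface in mind: a Partition is a small serializable handle for one split of an RDD, and the only method it must implement is index(). The sketch below is illustrative only (the class name and the shard field are invented for this page, not taken from any project listed here), but it mirrors the pattern the JdbcPartition and DeepPartition classes in the examples follow.

import org.apache.spark.Partition;

/**
 * Minimal sketch of a custom Partition. The index must be stable and unique
 * within its RDD; any extra fields (here, a hypothetical shard id) carry the
 * information the RDD needs in compute() to read that split.
 */
public class ExampleShardPartition implements Partition {

    private final int index;
    private final String shardId;

    public ExampleShardPartition(final int index, final String shardId) {
        this.index = index;
        this.shardId = shardId;
    }

    @Override
    public int index() {
        return index;
    }

    public String shardId() {
        return shardId;
    }

    @Override
    public int hashCode() {
        return index;
    }
}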
Example #1
Source File: SparkDatasetBoundedSourceVertex.java    From incubator-nemo with Apache License 2.0
/**
 * Constructor.
 *
 * @param sparkSession sparkSession to recreate on each executor.
 * @param dataset      Dataset to read data from.
 */
public SparkDatasetBoundedSourceVertex(final SparkSession sparkSession, final Dataset<T> dataset) {
  this.readables = new ArrayList<>();
  final RDD rdd = dataset.sparkRDD();
  final Partition[] partitions = rdd.getPartitions();
  for (int i = 0; i < partitions.length; i++) {
    readables.add(new SparkDatasetBoundedSourceReadable(
      partitions[i],
      sparkSession.getDatasetCommandsList(),
      sparkSession.getInitialConf(),
      i));
  }
  this.estimatedByteSize = dataset.javaRDD()
    .map(o -> (long) o.toString().getBytes("UTF-8").length)
    .reduce((a, b) -> a + b);
}
 
Example #2
Source File: GenericHadoopExtractor.java    From deep-spark with Apache License 2.0
@Override
public void initIterator(Partition dp, S config) {

    int id = config.getRddId();

    NewHadoopPartition split = (NewHadoopPartition) dp;

    TaskAttemptID attemptId = DeepSparkHadoopMapReduceUtil
            .newTaskAttemptID(jobTrackerId, id, true, split.index(), 0);

    Configuration configuration = getHadoopConfig(config);

    TaskAttemptContext hadoopAttemptContext = DeepSparkHadoopMapReduceUtil
            .newTaskAttemptContext(configuration, attemptId);

    try {
        reader = inputFormat.createRecordReader(split.serializableHadoopSplit().value(), hadoopAttemptContext);
        reader.initialize(split.serializableHadoopSplit().value(), hadoopAttemptContext);
    } catch (IOException | InterruptedException e) {
        throw new DeepGenericException(e);
    }
}
 
Example #3
Source File: ExtractorClientHandler.java    From deep-spark with Apache License 2.0
@Override
public Partition[] getPartitions(ExtractorConfig<T> config) {

    GetPartitionsAction<T> getPartitionsAction = new GetPartitionsAction<>(config);

    channel.writeAndFlush(getPartitionsAction);

    Response response;
    boolean interrupted = false;
    for (; ; ) {
        try {
            response = answer.take();
            break;
        } catch (InterruptedException ignore) {
            interrupted = true;
        }
    }

    if (interrupted) {
        Thread.currentThread().interrupt();
    }

    return ((GetPartitionsResponse) response).getPartitions();
}
 
Example #4
Source File: ExtractorClientHandler.java    From deep-spark with Apache License 2.0
@Override
public void initIterator(Partition dp, ExtractorConfig<T> config) {
    InitIteratorAction<T> initIteratorAction = new InitIteratorAction<>(dp, config);

    channel.writeAndFlush(initIteratorAction);

    Response response;
    boolean interrupted = false;
    for (; ; ) {
        try {
            response = answer.take();
            break;
        } catch (InterruptedException ignore) {
            interrupted = true;
        }
    }

    if (interrupted) {
        Thread.currentThread().interrupt();
    }
}
 
Example #5
Source File: JdbcNativeExtractor.java    From deep-spark with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public Partition[] getPartitions(S config) {
    jdbcDeepJobConfig = initConfig(config, jdbcDeepJobConfig);

    int upperBound = jdbcDeepJobConfig.getUpperBound();
    int lowerBound = jdbcDeepJobConfig.getLowerBound();
    int numPartitions = jdbcDeepJobConfig.getNumPartitions();
    int length = 1 + upperBound - lowerBound;
    Partition[] result = new Partition[numPartitions];
    for (int i = 0; i < numPartitions; i++) {
        // Each partition covers a contiguous slice of [lowerBound, upperBound].
        int start = lowerBound + ((i * length) / numPartitions);
        int end = lowerBound + (((i + 1) * length) / numPartitions) - 1;
        result[i] = new JdbcPartition(i, start, end);
    }
    return result;
}
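To make the arithmetic above concrete, the following standalone sketch (written for this page, not part of deep-spark) prints the ranges produced for a hypothetical lowerBound of 0, upperBound of 9 and three partitions; the slices come out contiguous and non-overlapping: [0, 2], [3, 5] and [6, 9].

public class JdbcRangeDemo {
    public static void main(String[] args) {
        int lowerBound = 0;      // hypothetical bounds for illustration
        int upperBound = 9;
        int numPartitions = 3;

        int length = 1 + upperBound - lowerBound;
        for (int i = 0; i < numPartitions; i++) {
            int start = lowerBound + ((i * length) / numPartitions);
            int end = lowerBound + (((i + 1) * length) / numPartitions) - 1;
            // Prints: partition 0 -> [0, 2], partition 1 -> [3, 5], partition 2 -> [6, 9]
            System.out.println("partition " + i + " -> [" + start + ", " + end + "]");
        }
    }
}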
 
Example #6
Source File: JdbcReader.java    From deep-spark with Apache License 2.0
/**
 * Initializes the reader for the given Spark partition.
 *
 * @param p
 *            Spark partition.
 * @throws Exception if the JDBC driver cannot be loaded or the connection fails
 */
public void init(Partition p) throws Exception {
    Class.forName(jdbcDeepJobConfig.getDriverClass());
    conn = DriverManager.getConnection(jdbcDeepJobConfig.getConnectionUrl(),
            jdbcDeepJobConfig.getUsername(),
            jdbcDeepJobConfig.getPassword());
    Statement statement = conn.createStatement();
    SelectQuery query = jdbcDeepJobConfig.getQuery();
    JdbcPartition jdbcPartition = (JdbcPartition) p;
    if (jdbcDeepJobConfig.getNumPartitions() > 1) {
        Column partitionKey = jdbcDeepJobConfig.getPartitionKey();
        query.getWhereClause().addCondition(BinaryCondition.lessThan(partitionKey, jdbcPartition.upper(), true))
                .addCondition(BinaryCondition.greaterThan(partitionKey, jdbcPartition.lower(), true));
    }
    resultSet = statement.executeQuery(query.toString());
    // Fetches first element
    this.hasNext = resultSet.next();
}
 
Example #7
Source File: RangePartitionCoalescer.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public PartitionGroup[] coalesce(int maxPartitions, RDD<?> parent) {
    if (maxPartitions != parent.getNumPartitions()) {
        throw new IllegalArgumentException("Cannot use " + getClass().getSimpleName() +
                " with a different number of partitions to the parent RDD.");
    }
    List<Partition> partitions = Arrays.asList(parent.getPartitions());
    PartitionGroup[] groups = new PartitionGroup[partitions.size()];

    for (int i = 0; i < partitions.size(); i++) {
        Seq<String> preferredLocations = parent.getPreferredLocations(partitions.get(i));
        scala.Option<String> preferredLocation = scala.Option.apply
                (preferredLocations.isEmpty() ? null : preferredLocations.apply(0));
        PartitionGroup group = new PartitionGroup(preferredLocation);
        List<Partition> partitionsInGroup =
                partitions.subList(i, maxEndPartitionIndexes.get(i) + 1);
        group.partitions().append(JavaConversions.asScalaBuffer(partitionsInGroup));
        groups[i] = group;
    }
    return groups;
}
 
Example #8
Source File: SparkTextFileBoundedSourceVertex.java    From incubator-nemo with Apache License 2.0
/**
 * Constructor.
 *
 * @param sparkContext  the spark context.
 * @param inputPath     the path of the target text file.
 * @param numPartitions the number of partitions.
 */
public SparkTextFileBoundedSourceVertex(final SparkContext sparkContext,
                                        final String inputPath,
                                        final int numPartitions) {
  this.readables = new ArrayList<>();
  final Partition[] partitions = sparkContext.textFile(inputPath, numPartitions).getPartitions();
  for (int i = 0; i < partitions.length; i++) {
    readables.add(new SparkTextFileBoundedSourceReadable(
      partitions[i],
      sparkContext.getConf(),
      i,
      inputPath,
      numPartitions));
  }
  this.estimatedSizeBytes = SizeEstimator.estimate(sparkContext.textFile(inputPath, numPartitions));
}
 
Example #9
Source File: CassandraExtractor.java    From deep-spark with Apache License 2.0
@Override
public void initIterator(final Partition dp,
                         S config) {

    cassandraJobConfig = initConfig(config, cassandraJobConfig);

    recordReader = initRecordReader((DeepPartition) dp, cassandraJobConfig);
}
 
Example #10
Source File: JdbcNativeExtractorTest.java    From deep-spark with Apache License 2.0
@Test
public void testPartitions() {
    JdbcNativeExtractor extractor = createJdbcNativeExtractor();

    Partition[] partitions = extractor.getPartitions(createJdbcDeepJobConfig());
    assertEquals(partitions.length, NUM_PARTITIONS);

    JdbcPartition partition0 = (JdbcPartition) partitions[0];
    assertEquals(partition0.index(), 0);
    assertEquals(partition0.lower(), 0);
    assertEquals(partition0.upper(), Integer.MAX_VALUE - 1);
}
 
Example #11
Source File: JdbcNeo4JReader.java    From deep-spark with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public void init(Partition p) throws Exception {
    Class.forName(Driver.class.getCanonicalName());
    conn = DriverManager.getConnection(jdbcNeo4JDeepJobConfig.getConnectionUrl(),
            jdbcNeo4JDeepJobConfig.getUsername(),
            jdbcNeo4JDeepJobConfig.getPassword());
    Statement statement = conn.createStatement();
    String query = jdbcNeo4JDeepJobConfig.getCypherQuery();
    resultSet = statement.executeQuery(query);
    // Fetches first element
    this.hasNext = resultSet.next();
}
 
Example #12
Source File: CassandraExtractor.java    From deep-spark with Apache License 2.0
/**
 * Returns the partitions on which this RDD depends.
 * <p/>
 * Uses the underlying CqlPagingInputFormat in order to retrieve the splits.
 * <p/>
 * The number of splits, and hence the number of partitions, equals the number of tokens
 * configured in cassandra.yaml plus 1.
 */
@Override
public Partition[] getPartitions(S config) {

    cassandraJobConfig = initConfig(config, cassandraJobConfig);

    List<DeepTokenRange> underlyingInputSplits;
    if (isFilterdByKey(cassandraJobConfig.getFilters(), cassandraJobConfig.fetchTableMetadata().getPartitionKey()
            .get(0).getName())) {

        underlyingInputSplits = new ArrayList<>();
        underlyingInputSplits.add(new DeepTokenRange(Long.MIN_VALUE, Long.MAX_VALUE, cassandraJobConfig.getHostList()));
    } else {

        if (cassandraJobConfig.isBisectModeSet()) {
            underlyingInputSplits = RangeUtils.getSplits(cassandraJobConfig);
        } else {
            underlyingInputSplits = ThriftRangeUtils.build(cassandraJobConfig).getSplits();
        }
    }

    Partition[] partitions = new DeepPartition[underlyingInputSplits.size()];

    int i = 0;
    for (DeepTokenRange split : underlyingInputSplits) {
        partitions[i] = new DeepPartition(cassandraJobConfig.getRddId(), i, split);
        ++i;
    }

    return partitions;
}
 
Example #13
Source File: JdbcNativeExtractor.java    From deep-spark with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public void initIterator(Partition dp, S config) {
    jdbcDeepJobConfig = initConfig(config, jdbcDeepJobConfig);
    this.jdbcReader = new JdbcReader(jdbcDeepJobConfig);
    try {
        this.jdbcReader.init(dp);
    } catch (Exception e) {
        throw new DeepGenericException("Unable to initialize JdbcReader", e);
    }
}
 
Example #14
Source File: JdbcNeo4JNativeExtractor.java    From deep-spark with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public void initIterator(Partition dp, S config) {
    jdbcNeo4JDeepJobConfig = initConfig(config, jdbcNeo4JDeepJobConfig);
    this.jdbcReader = new JdbcNeo4JReader(jdbcNeo4JDeepJobConfig);
    try {
        this.jdbcReader.init(dp);
    } catch (Exception e) {
        throw new DeepGenericException("Unable to initialize JdbcReader", e);
    }
}
 
Example #15
Source File: JdbcNeo4JNativeExtractor.java    From deep-spark with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public Partition[] getPartitions(S config) {
    JdbcNeo4JDeepJobConfig neo4jConfig = (JdbcNeo4JDeepJobConfig) config;
    JdbcPartition partition = new JdbcPartition(0, neo4jConfig.getLowerBound(), neo4jConfig.getUpperBound());
    Partition[] result = new Partition[1];
    result[0] = partition;
    return result;
}
 
Example #16
Source File: MongoReader.java    From deep-spark with Apache License 2.0
/**
 * Initializes the reader for the given partition.
 *
 * @param partition the partition to read
 */
public void init(Partition partition) {
    try {

        List<ServerAddress> addressList = new ArrayList<>();

        for (String s : (List<String>) ((DeepPartition) partition).splitWrapper().getReplicas()) {
            addressList.add(new ServerAddress(s));
        }

        //Credentials
        List<MongoCredential> mongoCredentials = new ArrayList<>();

        if (mongoDeepJobConfig.getUsername() != null && mongoDeepJobConfig.getPassword() != null) {
            MongoCredential credential = MongoCredential.createMongoCRCredential(mongoDeepJobConfig.getUsername(),
                    mongoDeepJobConfig.getDatabase(),
                    mongoDeepJobConfig.getPassword().toCharArray());
            mongoCredentials.add(credential);

        }

        mongoClient = new MongoClient(addressList, mongoCredentials);
        mongoClient.setReadPreference(ReadPreference.valueOf(mongoDeepJobConfig.getReadPreference()));
        db = mongoClient.getDB(mongoDeepJobConfig.getDatabase());
        collection = db.getCollection(mongoDeepJobConfig.getCollection());

        dbCursor = collection.find(generateFilterQuery((MongoPartition) partition),
                mongoDeepJobConfig.getDBFields());

    } catch (UnknownHostException e) {
        throw new DeepExtractorInitializationException(e);
    }
}
 
Example #17
Source File: MongoNativeExtractor.java    From deep-spark with Apache License 2.0
@Override
public void initIterator(Partition dp, S config) {

    mongoDeepJobConfig = initConfig(config, mongoDeepJobConfig);

    reader = new MongoReader(mongoDeepJobConfig);
    reader.init(dp);
}
 
Example #18
Source File: DeepRDD.java    From deep-spark with Apache License 2.0
@Override
public Iterator<T> compute(Partition split, TaskContext context) {

    initExtractorClient();

    extractorClient.initIterator(split, config.getValue());

    context.addTaskCompletionListener(new AbstractFunction1<TaskContext, BoxedUnit>() {

        @Override
        public BoxedUnit apply(TaskContext v1) {
            extractorClient.close();
            return null;
        }
    });

    java.util.Iterator<T> iterator = new java.util.Iterator<T>() {

        @Override
        public boolean hasNext() {
            return extractorClient.hasNext();
        }

        @Override
        public T next() {
            return extractorClient.next();
        }

        @Override
        public void remove() {
            throw new DeepIOException(
                    "Method not implemented (and won't be implemented anytime soon!!!)");
        }
    };

    return new InterruptibleIterator<>(context, asScalaIterator(iterator));

}
 
Example #19
Source File: DeepRDD.java    From deep-spark with Apache License 2.0
@Override
public Seq<String> getPreferredLocations(Partition split) {
    initExtractorClient();

    List<String> locations = extractorClient.getPreferredLocations(split);
    if (locations == null || locations.isEmpty()) {
        return super.getPreferredLocations(split);
    }

    return asScalaBuffer(locations);

}
 
Example #20
Source File: HBasePartitioner.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public void initialize() {
    List<Partition> partitions = Arrays.asList(((SparkDataSet) dataSet).rdd.rdd().partitions());
    tableSplits = new ArrayList<>(partitions.size());
    for (Partition p : partitions) {
        NewHadoopPartition nhp = (NewHadoopPartition) p;
        SMSplit sms = (SMSplit) nhp.serializableHadoopSplit().value();
        TableSplit ts = sms.getSplit();
        if (ts.getStartRow() != null && Bytes.equals(ts.getStartRow(), ts.getEndRow()) && ts.getStartRow().length > 0) {
            // this would be an empty partition, with the same start and end key, so don't add it
            continue;
        }
        tableSplits.add(ts);
    }
}
 
Example #21
Source File: SparkDatasetBoundedSourceVertex.java    From incubator-nemo with Apache License 2.0
/**
 * Constructor.
 *
 * @param partition          the partition to wrap.
 * @param commands           list of commands needed to build the dataset.
 * @param sessionInitialConf spark session's initial configuration.
 * @param partitionIndex     index of the partition for this readable.
 */
private SparkDatasetBoundedSourceReadable(final Partition partition,
                                          final LinkedHashMap<String, Object[]> commands,
                                          final Map<String, String> sessionInitialConf,
                                          final int partitionIndex) {
  this.commands = commands;
  this.sessionInitialConf = sessionInitialConf;
  this.partitionIndex = partitionIndex;
  this.locations = SparkSourceUtil.getPartitionLocation(partition);
}
 
Example #22
Source File: SparkTextFileBoundedSourceVertex.java    From incubator-nemo with Apache License 2.0
/**
 * Constructor.
 *
 * @param partition      the partition to wrap.
 * @param sparkConf      configuration needed to build the SparkContext.
 * @param partitionIndex index of the partition for this readable.
 * @param inputPath      the input file path.
 * @param numPartitions  the total number of partitions.
 */
private SparkTextFileBoundedSourceReadable(final Partition partition,
                                           final SparkConf sparkConf,
                                           final int partitionIndex,
                                           final String inputPath,
                                           final int numPartitions) {
  this.sparkConf = sparkConf;
  this.partitionIndex = partitionIndex;
  this.inputPath = inputPath;
  this.numPartitions = numPartitions;
  this.locations = SparkSourceUtil.getPartitionLocation(partition);
}
 
Example #23
Source File: ExtractorServerHandler.java    From deep-spark with Apache License 2.0
protected Partition[] getPartitions(GetPartitionsAction<T> getPartitionsAction) {

    if (extractor == null) {
        this.initExtractor(getPartitionsAction.getConfig());
    }

    return extractor.getPartitions(getPartitionsAction.getConfig());
}
 
Example #24
Source File: SparkSourceUtil.java    From incubator-nemo with Apache License 2.0
/**
 * Gets the source location of a Spark partition.
 *
 * @param partition the partition to get location.
 * @return a list of locations.
 * @throws RuntimeException if failed to get source location.
 */
static List<String> getPartitionLocation(final Partition partition) {
  try {
    if (partition instanceof HadoopPartition) {
      final Field inputSplitField = partition.getClass().getDeclaredField("inputSplit");
      inputSplitField.setAccessible(true);
      final InputSplit inputSplit = (InputSplit) ((SerializableWritable) inputSplitField.get(partition)).value();

      final String[] splitLocations = inputSplit.getLocations();
      final List<String> parsedLocations = new ArrayList<>();

      for (final String loc : splitLocations) {
        final String canonicalHostName = InetAddress.getByName(loc).getCanonicalHostName();
        parsedLocations.add(canonicalHostName);
      }

      if (parsedLocations.size() == 1 && parsedLocations.get(0).equals("localhost")) {
        return Collections.emptyList();
      } else {
        return parsedLocations;
      }
    } else {
      return Collections.emptyList();
    }
  } catch (final Exception e) {
    throw new RuntimeException(e);
  }
}
 
Example #25
Source File: SourceRDD.java    From beam with Apache License 2.0
@Override
public scala.collection.Iterator<scala.Tuple2<Source<T>, CheckpointMarkT>> compute(
    Partition split, TaskContext context) {
  @SuppressWarnings("unchecked")
  CheckpointableSourcePartition<T, CheckpointMarkT> partition =
      (CheckpointableSourcePartition<T, CheckpointMarkT>) split;
  scala.Tuple2<Source<T>, CheckpointMarkT> tuple2 =
      new scala.Tuple2<>(partition.getSource(), partition.checkpointMark);
  return JavaConversions.asScalaIterator(Collections.singleton(tuple2).iterator());
}
 
Example #26
Source File: SourceRDD.java    From beam with Apache License 2.0
@Override
public Partition[] getPartitions() {
  try {
    final List<? extends Source<T>> partitionedSources = microbatchSource.split(options.get());
    final Partition[] partitions = new CheckpointableSourcePartition[partitionedSources.size()];
    for (int i = 0; i < partitionedSources.size(); i++) {
      partitions[i] =
          new CheckpointableSourcePartition<>(
              id(), i, partitionedSources.get(i), EmptyCheckpointMark.get());
    }
    return partitions;
  } catch (Exception e) {
    throw new RuntimeException("Failed to create partitions.", e);
  }
}
 
Example #27
Source File: MizoRDD.java    From mizo with Apache License 2.0
@Override
public scala.collection.Iterator<TReturn> compute(Partition split, TaskContext context) {
    String regionEdgesFamilyPath = this.regionsPaths.get(split.index());
    log.info("Running Mizo on region #{} located at: {}", split.index(), regionEdgesFamilyPath);

    return createRegionIterator(createRegionRelationsIterator(regionEdgesFamilyPath));
}
 
Example #28
Source File: MizoRDD.java    From mizo with Apache License 2.0
@Override
public Partition[] getPartitions() {
    return Iterators.toArray(IntStream
            .range(0, this.regionsPaths.size())
            .mapToObj(i -> (Partition) () -> i)
            .iterator(), Partition.class);
}
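The cast (Partition) () -> i in this example compiles because index() is the only abstract method on Partition, so from Java it can be treated as a functional interface on Spark builds whose Scala version encodes traits as interfaces. If you want the same index-only partitions without relying on that, an anonymous class works everywhere; the helper below is a sketch written for this page, not Mizo code.

import org.apache.spark.Partition;

final class IndexOnlyPartitions {
    /** Builds one index-only Partition per region, mirroring Example #28 without lambdas. */
    static Partition[] build(final int numRegions) {
        final Partition[] partitions = new Partition[numRegions];
        for (int i = 0; i < numRegions; i++) {
            final int index = i;
            partitions[i] = new Partition() {
                @Override
                public int index() {
                    return index;
                }
            };
        }
        return partitions;
    }
}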
 
Example #29
Source File: SourceRDD.java    From beam with Apache License 2.0
@Override
public scala.collection.Iterator<WindowedValue<T>> compute(
    final Partition split, final TaskContext context) {
  final MetricsContainer metricsContainer = metricsAccum.value().getContainer(stepName);

  @SuppressWarnings("unchecked")
  final BoundedSource.BoundedReader<T> reader = createReader((SourcePartition<T>) split);

  final Iterator<WindowedValue<T>> readerIterator =
      new ReaderToIteratorAdapter<>(metricsContainer, reader);

  return new InterruptibleIterator<>(context, JavaConversions.asScalaIterator(readerIterator));
}
 
Example #30
Source File: MongoNativeExtractor.java    From deep-spark with Apache License 2.0
@Override
public List<String> getPreferredLocations(Partition split) {
    return removeAddressPort(((DeepPartition) split).splitWrapper().getReplicas());
}