org.apache.spark.TaskContext Java Examples
The following examples show how to use
org.apache.spark.TaskContext.
Each example is adapted from an open-source project; the originating source file, project, and license are noted above each snippet.
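Before the project examples, here is a minimal, self-contained sketch (not taken from any of the listed projects) showing the TaskContext calls the examples rely on most: TaskContext.get(), TaskContext.getPartitionId(), stageId(), attemptNumber(), taskAttemptId(), and addTaskCompletionListener. The class name TaskContextDemo and the local[2] master are illustrative assumptions, not part of any project below.

import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.TaskCompletionListener;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class TaskContextDemo {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("TaskContextDemo").setMaster("local[2]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      List<String> info = jsc.parallelize(Arrays.asList(1, 2, 3, 4), 2)
          .mapPartitions(nums -> {
            TaskContext ctx = TaskContext.get(); // non-null only inside a running task
            // Run cleanup when this task finishes, successfully or not
            ctx.addTaskCompletionListener((TaskCompletionListener) done ->
                System.out.println("task " + done.taskAttemptId() + " finished"));
            // Identifiers that the examples below combine into unique file names and write tokens
            String summary = "stage=" + ctx.stageId()
                + " partition=" + TaskContext.getPartitionId()
                + " attempt=" + ctx.attemptNumber()
                + " taskAttemptId=" + ctx.taskAttemptId();
            return Collections.singletonList(summary).iterator();
          })
          .collect();
      info.forEach(System.out::println);
    }
  }
}

The cast to TaskCompletionListener mirrors the pattern used in several examples below; it disambiguates between the listener and Scala function overloads of addTaskCompletionListener.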
Example #1
Source File: TestCopyOnWriteActionExecutor.java From hudi with Apache License 2.0
@Test
public void testMakeNewPath() throws Exception {
  String fileName = UUID.randomUUID().toString();
  String partitionPath = "2016/05/04";
  String instantTime = HoodieTestUtils.makeNewCommitTime();
  HoodieWriteConfig config = makeHoodieClientConfig();
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable table = HoodieTable.create(metaClient, config, hadoopConf);

  Pair<Path, String> newPathWithWriteToken = jsc.parallelize(Arrays.asList(1)).map(x -> {
    HoodieRecord record = mock(HoodieRecord.class);
    when(record.getPartitionPath()).thenReturn(partitionPath);
    String writeToken = FSUtils.makeWriteToken(TaskContext.getPartitionId(), TaskContext.get().stageId(),
        TaskContext.get().taskAttemptId());
    HoodieCreateHandle io = new HoodieCreateHandle(config, instantTime, table, partitionPath, fileName, supplier);
    return Pair.of(io.makeNewPath(record.getPartitionPath()), writeToken);
  }).collect().get(0);

  assertEquals(newPathWithWriteToken.getKey().toString(),
      Paths.get(this.basePath, partitionPath,
          FSUtils.makeDataFileName(instantTime, newPathWithWriteToken.getRight(), fileName)).toString());
}
Example #2
Source File: SpliceOutputCommitter.java From spliceengine with GNU Affero General Public License v3.0
@Override
public void setupTask(TaskAttemptContext taskContext) throws IOException {
  if (LOG.isDebugEnabled())
    SpliceLogUtils.debug(LOG, "setupTask");

  // Create child additive transaction so we don't read rows inserted by ourselves in this operation
  TaskContext sparkTaskContext = TaskContext.get();
  TaskId taskId = null;
  if (sparkTaskContext != null) {
    int stageId = sparkTaskContext.stageId();
    int partitionId = sparkTaskContext.partitionId();
    int attemptNumber = sparkTaskContext.attemptNumber();
    taskId = new TaskId(stageId, partitionId, attemptNumber);
  }
  TxnView txn = SIDriver.driver().lifecycleManager().beginChildTransaction(parentTxn,
      parentTxn.getIsolationLevel(), true, destinationTable, false, taskId);
  ActiveWriteTxn childTxn = new ActiveWriteTxn(txn.getTxnId(), txn.getTxnId(), parentTxn, true,
      parentTxn.getIsolationLevel(), taskId);
  currentTxn.set(childTxn);
  if (LOG.isDebugEnabled())
    SpliceLogUtils.debug(LOG, "beginTxn=%s and destinationTable=%s", childTxn, destinationTable);
}
Example #3
Source File: SparkTableUtil.java From iceberg with Apache License 2.0
private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
Example #4
Source File: StreamNodeLoader.java From sylph with Apache License 2.0
private static Sink<JavaRDD<Row>> loadRealTimeSink(RealTimeSink realTimeSink) {
  return (Sink<JavaRDD<Row>>) rdd -> rdd.foreachPartition(partition -> {
    Throwable errorOrNull = null;
    try {
      int partitionId = TaskContext.getPartitionId();
      boolean openOK = realTimeSink.open(partitionId, 0); // initialize; only process data if open() succeeds
      if (openOK) {
        partition.forEachRemaining(row -> realTimeSink.process(SparkRecord.make(row)));
      }
    } catch (Exception e) {
      errorOrNull = e; // open() failed
    } finally {
      realTimeSink.close(errorOrNull); // destroy()
    }
  });
}
Example #5
Source File: StreamNodeLoader.java From sylph with Apache License 2.0
public static Iterator<Row> transFunction(Iterator<Row> partition, RealTimeTransForm realTimeTransForm) {
  Exception errorOrNull = null;
  Schema schema = realTimeTransForm.getSchema(); // if not null
  List<Row> list = new ArrayList<>();
  try {
    int partitionId = TaskContext.getPartitionId();
    if (realTimeTransForm.open(partitionId, 0)) {
      partition.forEachRemaining(row -> {
        realTimeTransForm.process(SparkRecord.make(row), (transOutrow) -> {
          //TODO: SparkRow.parserRow(x) with schema ?
          list.add(SparkRecord.parserRow(transOutrow));
        });
      });
    }
  } catch (Exception e) {
    errorOrNull = e; // transformation failed; this batch of data is dropped
  } finally {
    realTimeTransForm.close(errorOrNull); // destroy()
  }
  return list.iterator();
}
Example #6
Source File: NLJoinFunction.java From spliceengine with GNU Affero General Public License v3.0
protected void init(Iterator<ExecRow> from) throws StandardException {
  checkInit();
  taskContext = TaskContext.get();
  if (taskContext != null) {
    taskContext.addTaskCompletionListener((TaskCompletionListener) (t) -> close());
  }
  operationContext.getOperation().registerCloseable(this);
  SConfiguration configuration = EngineDriver.driver().getConfiguration();
  batchSize = configuration.getNestedLoopJoinBatchSize();
  nLeftRows = 0;
  leftSideIterator = from;
  executorService = SIDriver.driver().getExecutorService();
  firstBatch = new ArrayDeque<>(batchSize);

  initOperationContexts();
  loadBatch();
}
Example #7
Source File: ClusterFunctionProvider.java From datacollector with Apache License 2.0
public static synchronized ClusterFunction getClusterFunction() throws Exception {
  // Why such a complex name?
  // When an executor dies and a new one takes its place, having just partition id won't work, because the old file
  // might not have been closed by the namenode since the old executor handling that partition may have just died.
  // So we must ensure a truly unique part which is executor id.
  // ---- BUT ----
  // Multiple partitions of the same job can run on the same executor, which is especially true now since we allow
  // the user to set fewer executors than partitions, so we need the partition id.
  // ---- BUT ----
  // Users could end up not making it unique enough, since partition id and executor id are not unique across jobs,
  // so if they use ${sdc:id()} in 2 cluster pipelines with same directory, then it will still collide, so prefix
  // this with pipeline id.
  // ---- DONE, YAY! ----
  if (clusterFunction == null) {
    clusterFunction = (ClusterFunction) BootstrapCluster.getClusterFunction(
        BootstrapCluster.getProperties().getProperty(ClusterModeConstants.CLUSTER_PIPELINE_NAME)
            + "-" + TaskContext.get().partitionId()
            + "-" + SparkEnv.get().executorId()
    );
  }
  return clusterFunction;
}
Example #8
Source File: SparkAMDSI.java From deeplearning4j with Apache License 2.0
public SparkAMDSI(MultiDataSetIterator iterator, int queueSize, BlockingQueue<MultiDataSet> queue,
                  boolean useWorkspace, DataSetCallback callback, Integer deviceId) {
  this();

  if (queueSize < 2)
    queueSize = 2;

  this.callback = callback;
  this.buffer = queue;
  this.backedIterator = iterator;
  this.useWorkspaces = useWorkspace;
  this.prefetchSize = queueSize;
  this.workspaceId = "SAMDSI_ITER-" + java.util.UUID.randomUUID().toString();
  this.deviceId = deviceId;

  if (iterator.resetSupported())
    this.backedIterator.reset();

  this.thread = new SparkPrefetchThread(buffer, iterator, terminator,
      Nd4j.getAffinityManager().getDeviceForCurrentThread());

  context = TaskContext.get();

  thread.setDaemon(true);
  thread.start();
}
Example #9
Source File: CompatUtils.java From elasticsearch-hadoop with Apache License 2.0
static void addOnCompletition(TaskContext taskContext, final Function0<?> function) {
  taskContext.addTaskCompletionListener(new TaskCompletionListener() {
    @Override
    public void onTaskCompletion(TaskContext context) {
      function.apply();
    }
  });
}
Example #10
Source File: SourceRDD.java From beam with Apache License 2.0
@Override
public scala.collection.Iterator<scala.Tuple2<Source<T>, CheckpointMarkT>> compute(
    Partition split, TaskContext context) {
  @SuppressWarnings("unchecked")
  CheckpointableSourcePartition<T, CheckpointMarkT> partition =
      (CheckpointableSourcePartition<T, CheckpointMarkT>) split;
  scala.Tuple2<Source<T>, CheckpointMarkT> tuple2 =
      new scala.Tuple2<>(partition.getSource(), partition.checkpointMark);
  return JavaConversions.asScalaIterator(Collections.singleton(tuple2).iterator());
}
Example #11
Source File: HiveWarehouseDataReader.java From spark-llap with Apache License 2.0
protected TaskAttemptID getTaskAttemptID(LlapInputSplit split, JobConf conf) throws IOException {
  // Get pseudo-ApplicationId to submit task attempt from external client
  SubmitWorkInfo submitWorkInfo = SubmitWorkInfo.fromBytes(split.getPlanBytes());
  ApplicationId appId = submitWorkInfo.getFakeAppId();
  JobID jobId = new JobID(Long.toString(appId.getClusterTimestamp()), appId.getId());
  // Create TaskAttemptID from Spark TaskContext (TaskType doesn't matter)
  return new TaskAttemptID(new TaskID(jobId, TaskType.MAP, TaskContext.get().partitionId()),
      TaskContext.get().attemptNumber());
}
Example #12
Source File: RemoteDPParForSparkWorker.java From systemds with Apache License 2.0
@Override
public Iterator<Tuple2<Long, String>> call(Iterator<Tuple2<Long, Iterable<Writable>>> arg0) throws Exception {
  //lazy parworker initialization
  configureWorker( TaskContext.get().taskAttemptId() );

  //process all matrix partitions of this data partition
  MatrixBlock partition = null;
  while( arg0.hasNext() ) {
    Tuple2<Long,Iterable<Writable>> larg = arg0.next();

    //collect input partition (check via equals because oinfo deserialized instance)
    if( _fmt == FileFormat.BINARY )
      partition = collectBinaryBlock( larg._2(), partition );
    else
      partition = collectBinaryCellInput( larg._2() );

    //update in-memory matrix partition
    MatrixObject mo = _ec.getMatrixObject( _inputVar );
    mo.setInMemoryPartition( partition );

    //create tasks for input data
    Task lTask = new Task(_iterVar, TaskType.SET);
    lTask.addIteration( new IntObject(larg._1()) );

    //execute program
    long numIter = getExecutedIterations();
    super.executeTask( lTask );

    //maintain accumulators
    _aTasks.add( 1 );
    _aIters.add( (int)(getExecutedIterations()-numIter) );
  }

  //write output if required (matrix indexed write)
  return RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars)
    .stream().map(s -> new Tuple2<>(_workerID, s)).iterator();
}
Example #13
Source File: RemoteParForSparkWorker.java From systemds with Apache License 2.0
@Override
public Iterator<Tuple2<Long, String>> call(Task arg0) throws Exception {
  //lazy parworker initialization
  if( !_initialized )
    configureWorker(TaskContext.get().taskAttemptId());

  //keep input var names
  Set<String> inVars = new HashSet<>(_ec.getVariables().keySet());

  //execute a single task
  long numIter = getExecutedIterations();
  super.executeTask( arg0 );

  //maintain accumulators
  _aTasks.add( 1 );
  _aIters.add( (int)(getExecutedIterations()-numIter) );

  //cleanup remaining intermediate variables from buffer pool
  _ec.getVariables().keySet().stream().filter(v -> !inVars.contains(v))
    .map(v -> _ec.getVariable(v)).filter(d -> d instanceof CacheableData)
    .forEach(c -> ((CacheableData<?>)c).freeEvictedBlob());

  //write output lineage if required
  if( DMLScript.LINEAGE )
    RemoteParForUtils.exportLineageItems(_workerID, _ec.getVariables(), _resultVars, _ec.getLineage());

  //write output if required (matrix indexed write), incl cleanup pinned vars
  //note: this copy is necessary for environments without spark libraries
  return RemoteParForUtils
    .exportResultVariables(_workerID, _ec.getVariables(), _resultVars)
    .stream().map(s -> new Tuple2<>(_workerID, s)).iterator();
}
Example #14
Source File: DeepRDD.java From deep-spark with Apache License 2.0
@Override
public Iterator<T> compute(Partition split, TaskContext context) {
  initExtractorClient();

  extractorClient.initIterator(split, config.getValue());

  context.addTaskCompletionListener(new AbstractFunction1<TaskContext, BoxedUnit>() {
    @Override
    public BoxedUnit apply(TaskContext v1) {
      extractorClient.close();
      return null;
    }
  });

  java.util.Iterator<T> iterator = new java.util.Iterator<T>() {
    @Override
    public boolean hasNext() {
      return extractorClient.hasNext();
    }

    @Override
    public T next() {
      return extractorClient.next();
    }

    @Override
    public void remove() {
      throw new DeepIOException("Method not implemented (and won't be implemented anytime soon!!!)");
    }
  };

  return new InterruptibleIterator<>(context, asScalaIterator(iterator));
}
Example #15
Source File: SparkADSI.java From deeplearning4j with Apache License 2.0
public SparkADSI(DataSetIterator iterator, int queueSize, BlockingQueue<DataSet> queue, boolean useWorkspace,
                 DataSetCallback callback, Integer deviceId) {
  this();

  if (queueSize < 2)
    queueSize = 2;

  this.deviceId = deviceId;
  this.callback = callback;
  this.useWorkspace = useWorkspace;
  this.buffer = queue;
  this.prefetchSize = queueSize;
  this.backedIterator = iterator;
  this.workspaceId = "SADSI_ITER-" + java.util.UUID.randomUUID().toString();

  if (iterator.resetSupported())
    this.backedIterator.reset();

  context = TaskContext.get();

  this.thread = new SparkPrefetchThread(buffer, iterator, terminator, null,
      Nd4j.getAffinityManager().getDeviceForCurrentThread());

  /*
   * We want to ensure that the background thread has the same thread->device affinity as the master thread.
   */
  thread.setDaemon(true);
  thread.start();
}
Example #16
Source File: RemoteDPParForSparkWorker.java From systemds with Apache License 2.0
@Override
public Iterator<Tuple2<Long, String>> call(Iterator<Tuple2<Long, Iterable<Writable>>> arg0) throws Exception {
  //lazy parworker initialization
  configureWorker( TaskContext.get().taskAttemptId() );

  //process all matrix partitions of this data partition
  MatrixBlock partition = null;
  while( arg0.hasNext() ) {
    Tuple2<Long,Iterable<Writable>> larg = arg0.next();

    //collect input partition (check via equals because oinfo deserialized instance)
    if( _oinfo.equals(OutputInfo.BinaryBlockOutputInfo) )
      partition = collectBinaryBlock( larg._2(), partition );
    else
      partition = collectBinaryCellInput( larg._2() );

    //update in-memory matrix partition
    MatrixObject mo = _ec.getMatrixObject( _inputVar );
    mo.setInMemoryPartition( partition );

    //create tasks for input data
    Task lTask = new Task(_iterVar, TaskType.SET);
    lTask.addIteration( new IntObject(larg._1()) );

    //execute program
    long numIter = getExecutedIterations();
    super.executeTask( lTask );

    //maintain accumulators
    _aTasks.add( 1 );
    _aIters.add( (int)(getExecutedIterations()-numIter) );
  }

  //write output if required (matrix indexed write)
  return RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars)
    .stream().map(s -> new Tuple2<>(_workerID, s)).iterator();
}
Example #17
Source File: IteratorUtils.java From spliceengine with GNU Affero General Public License v3.0
public static <E> Iterator<E> asInterruptibleIterator(Iterator<E> it) {
  TaskContext context = TaskContext.get();
  if (context != null) {
    return (Iterator<E>) JavaConverters.asJavaIteratorConverter(
        new InterruptibleIterator(context, JavaConverters.asScalaIteratorConverter(it).asScala())).asJava();
  } else
    return it;
}
Example #18
Source File: SparkLeanOperationContext.java From spliceengine with GNU Affero General Public License v3.0
@Override
@SuppressFBWarnings(value = "ST_WRITE_TO_STATIC_FROM_INSTANCE_METHOD", justification = "intended")
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
  if (in.readBoolean()) {
    SpliceClient.connectionString = in.readUTF();
    SpliceClient.setClient(HConfiguration.getConfiguration().getAuthenticationTokenEnabled(),
        SpliceClient.Mode.EXECUTOR);
  }
  badRecordsSeen = in.readLong();
  badRecordThreshold = in.readLong();
  permissive = in.readBoolean();
  SpliceSpark.setupSpliceStaticComponents();
  boolean isOp = in.readBoolean();
  if (isOp) {
    broadcastedActivation = (BroadcastedActivation) in.readObject();
    ActivationHolder ah = broadcastedActivation.getActivationHolder();
    op = (Op) ah.getOperationsMap().get(in.readInt());
    activation = ah.getActivation();
    TaskContext taskContext = TaskContext.get();
    if (taskContext != null) {
      taskContext.addTaskCompletionListener((TaskCompletionListener) (ctx) -> ah.close());
    }
  }
  badRecordsAccumulator = (Accumulable<BadRecordsRecorder, String>) in.readObject();
  importFileName = (String) in.readObject();
  rowsWritten = (LongAccumulator) in.readObject();
}
Example #19
Source File: KafkaStreamer.java From spliceengine with GNU Affero General Public License v3.0
@Override
public Iterator<String> call(Integer partition, Iterator<T> locatedRowIterator) throws Exception {
  taskContext = TaskContext.get();

  if (taskContext != null && taskContext.attemptNumber() > 0) {
    LOG.trace("KS.c attempts " + taskContext.attemptNumber());
    long entriesInKafka = KafkaUtils.messageCount(bootstrapServers, topicName, partition);
    LOG.trace("KS.c entries " + entriesInKafka);
    for (long i = 0; i < entriesInKafka; ++i) {
      locatedRowIterator.next();
    }
  }

  Properties props = new Properties();
  props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
  props.put(ProducerConfig.CLIENT_ID_CONFIG, "spark-producer-dss-ks-" + UUID.randomUUID());
  props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, IntegerSerializer.class.getName());
  props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ExternalizableSerializer.class.getName());

  KafkaProducer<Integer, Externalizable> producer = new KafkaProducer<>(props);
  int count = 0;

  while (locatedRowIterator.hasNext()) {
    T lr = locatedRowIterator.next();
    ProducerRecord<Integer, Externalizable> record = new ProducerRecord(topicName, count++, lr);
    producer.send(record);
    LOG.trace("KS.c sent " + partition.intValue() + " " + count + " " + lr);
  }
  LOG.trace("KS.c count " + partition.intValue() + " " + count);

  producer.close();
  // TODO Clean up
  return Arrays.asList("OK").iterator();
}
Example #20
Source File: SourceRDD.java From beam with Apache License 2.0
@Override
public scala.collection.Iterator<WindowedValue<T>> compute(final Partition split, final TaskContext context) {
  final MetricsContainer metricsContainer = metricsAccum.value().getContainer(stepName);

  @SuppressWarnings("unchecked")
  final BoundedSource.BoundedReader<T> reader = createReader((SourcePartition<T>) split);

  final Iterator<WindowedValue<T>> readerIterator = new ReaderToIteratorAdapter<>(metricsContainer, reader);

  return new InterruptibleIterator<>(context, JavaConversions.asScalaIterator(readerIterator));
}
Example #21
Source File: RemoteParForSparkWorker.java From systemds with Apache License 2.0
@Override
public Iterator<Tuple2<Long, String>> call(Task arg0) throws Exception {
  //lazy parworker initialization
  if( !_initialized )
    configureWorker(TaskContext.get().taskAttemptId());

  //keep input var names
  Set<String> inVars = new HashSet<>(_ec.getVariables().keySet());

  //execute a single task
  long numIter = getExecutedIterations();
  super.executeTask( arg0 );

  //maintain accumulators
  _aTasks.add( 1 );
  _aIters.add( (int)(getExecutedIterations()-numIter) );

  //cleanup remaining intermediate variables from buffer pool
  _ec.getVariables().keySet().stream().filter(v -> !inVars.contains(v))
    .map(v -> _ec.getVariable(v)).filter(d -> d instanceof CacheableData)
    .forEach(c -> ((CacheableData<?>)c).freeEvictedBlob());

  //write output lineage if required
  if( DMLScript.LINEAGE )
    RemoteParForUtils.exportLineageItems(_workerID, _ec.getVariables(), _resultVars, _ec.getLineage());

  //write output if required (matrix indexed write), incl cleanup pinned vars
  //note: this copy is necessary for environments without spark libraries
  return RemoteParForUtils
    .exportResultVariables(_workerID, _ec.getVariables(), _resultVars)
    .stream().map(s -> new Tuple2<>(_workerID, s)).iterator();
}
Example #22
Source File: ExpKeyFilenameMap.java From incubator-retired-pirk with Apache License 2.0
@Override
public Iterator<Tuple2<Integer, String>> call(Iterator<Tuple2<Integer, Iterable<Tuple2<Integer, BigInteger>>>> iter)
    throws Exception {
  List<Tuple2<Integer, String>> keyFileList = new ArrayList<>();

  FileSystem fs = FileSystem.get(new Configuration());

  // Form the filename for the exp table portion that corresponds to this partition
  int taskId = TaskContext.getPartitionId();
  logger.info("taskId = " + taskId);

  String fileName = expOutDir + "/exp-" + String.format("%05d", taskId);
  logger.info("fileName = " + fileName);

  // Iterate over the elements of the partition
  BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(fileName), true)));
  while (iter.hasNext()) {
    // <queryHash, <<power>,<element^power mod N^2>>
    Tuple2<Integer, Iterable<Tuple2<Integer, BigInteger>>> expTuple = iter.next();
    int queryHash = expTuple._1;

    // Record the queryHash -> fileName
    keyFileList.add(new Tuple2<>(queryHash, fileName));

    // Write the partition elements to the corresponding exp table file
    // each line: queryHash,<power>-<element^power mod N^2>
    for (Tuple2<Integer, BigInteger> modPow : expTuple._2) {
      String lineOut = queryHash + "," + modPow._1 + "-" + modPow._2;
      bw.write(lineOut);
      bw.newLine();
    }
  }
  bw.close();

  return keyFileList.iterator();
}
Example #23
Source File: MizoRDD.java From mizo with Apache License 2.0
@Override
public scala.collection.Iterator<TReturn> compute(Partition split, TaskContext context) {
  String regionEdgesFamilyPath = this.regionsPaths.get(split.index());
  log.info("Running Mizo on region #{} located at: {}", split.index(), regionEdgesFamilyPath);

  return createRegionIterator(createRegionRelationsIterator(regionEdgesFamilyPath));
}
Example #24
Source File: SMRecordReaderImpl.java From spliceengine with GNU Affero General Public License v3.0
@Override
public void onTaskFailure(TaskContext context, Throwable error) {
  LOG.error("Task failed for split: " + split, error);
}
Example #25
Source File: SMRecordReaderImpl.java From spliceengine with GNU Affero General Public License v3.0
public void init(Configuration config, InputSplit split) throws IOException, InterruptedException {
  if (LOG.isDebugEnabled())
    SpliceLogUtils.debug(LOG, "init");
  if (TaskContext.get() != null) {
    TaskContext.get().addTaskFailureListener(this);
  }
  String tableScannerAsString = config.get(MRConstants.SPLICE_SCAN_INFO);
  if (tableScannerAsString == null)
    throw new IOException("splice scan info was not serialized to task, failing");
  byte[] scanStartKey = null;
  byte[] scanStopKey = null;
  try {
    builder = TableScannerBuilder.getTableScannerBuilderFromBase64String(tableScannerAsString);
    if (LOG.isTraceEnabled())
      SpliceLogUtils.trace(LOG, "config loaded builder=%s", builder);
    TableSplit tSplit = ((SMSplit) split).getSplit();
    token = builder.getToken();
    DataScan scan = builder.getScan();
    scanStartKey = scan.getStartKey();
    scanStopKey = scan.getStopKey();
    if (Bytes.startComparator.compare(scanStartKey, tSplit.getStartRow()) < 0) {
      // the split itself is more restrictive
      scan.startKey(tSplit.getStartRow());
    }
    if (Bytes.endComparator.compare(scanStopKey, tSplit.getEndRow()) > 0) {
      // the split itself is more restrictive
      scan.stopKey(tSplit.getEndRow());
    }
    setScan(((HScan) scan).unwrapDelegate());
    // TODO (wjk): this seems weird (added with DB-4483)
    this.statisticsRun = AbstractSMInputFormat.oneSplitPerRegion(config);
    Double sampling = AbstractSMInputFormat.sampling(config);
    if (sampling != null) {
      this.sampling = true;
      this.samplingRate = sampling;
    }
    restart(scan.getStartKey());
  } catch (IOException ioe) {
    LOG.error(String.format("Received exception with scan %s, original start key %s, original stop key %s, split %s",
        scan, Bytes.toStringBinary(scanStartKey), Bytes.toStringBinary(scanStopKey), split), ioe);
    throw ioe;
  } catch (StandardException e) {
    throw new IOException(e);
  }
}
Example #26
Source File: SparkFactDistinct.java From kylin-on-parquet-v2 with Apache License 2.0
private void init() throws IOException {
  taskId = TaskContext.getPartitionId();
  kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
  try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
      .setAndUnsetThreadLocalConfig(kConfig)) {
    CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName);
    cubeDesc = cubeInstance.getDescriptor();
    cubeConfig = cubeInstance.getConfig();
    reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance);

    result = Lists.newArrayList();

    if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
      // hll
      isStatistics = true;
      baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId();
      baseCuboidRowCountInMappers = Lists.newArrayList();
      cuboidHLLMap = Maps.newHashMap();

      logger.info("Partition {} handling stats", taskId);
    } else {
      // normal col
      col = reducerMapping.getColForReducer(taskId);
      Preconditions.checkNotNull(col);

      isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col)
          && col.getType().needCompare();
      isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col);

      // local build dict
      buildDictInReducer = kConfig.isBuildDictInReducerEnabled();
      if (cubeDesc.getDictionaryBuilderClass(col) != null) {
        // only works with default dictionary builder
        buildDictInReducer = false;
      }
      if (reducerMapping.getReducerNumForDimCol(col) > 1) {
        buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
      }
      if (buildDictInReducer) {
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
      }
      logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer);
    }

    initialized = true;
  }
}
Example #27
Source File: KafkaReadFunction.java From spliceengine with GNU Affero General Public License v3.0
@Override
public Iterator<ExecRow> call(Integer partition) throws Exception {
  Properties props = new Properties();
  props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);

  String consumer_id = "spark-consumer-dss-krf-" + UUID.randomUUID();
  props.put(ConsumerConfig.GROUP_ID_CONFIG, consumer_id);
  props.put(ConsumerConfig.CLIENT_ID_CONFIG, consumer_id);

  props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, IntegerDeserializer.class.getName());
  props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ExternalizableDeserializer.class.getName());
  props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

  KafkaConsumer<Integer, Externalizable> consumer = new KafkaConsumer<Integer, Externalizable>(props);
  consumer.assign(Arrays.asList(new TopicPartition(topicName, partition)));

  return new Iterator<ExecRow>() {
    Iterator<ConsumerRecord<Integer, Externalizable>> it = null;

    @Override
    public boolean hasNext() {
      if (it == null) {
        ConsumerRecords<Integer, Externalizable> records = null;
        while (records == null || records.isEmpty()) {
          records = consumer.poll(java.time.Duration.ofMillis(1000));
          if (TaskContext.get().isInterrupted()) {
            consumer.close();
            throw new TaskKilledException();
          }
        }
        it = records.iterator();
      }
      if (it.hasNext()) {
        return true;
      } else {
        consumer.close();
        return false;
      }
    }

    @Override
    public ExecRow next() {
      return (ExecRow) it.next().value();
    }
  };
}
Example #28
Source File: SparkJavaRDD.java From incubator-nemo with Apache License 2.0
@Override
public Iterator<T> iterator(final Partition split, final TaskContext taskContext) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
Example #29
Source File: RowDataRewriter.java From iceberg with Apache License 2.0
private TaskResult rewriteDataForTask(CombinedScanTask task) throws Exception {
  TaskContext context = TaskContext.get();
  int partitionId = context.partitionId();
  long taskId = context.taskAttemptId();

  RowDataReader dataReader = new RowDataReader(
      task, schema, schema, nameMapping, io.value(), encryptionManager.value(), caseSensitive);

  SparkAppenderFactory appenderFactory = new SparkAppenderFactory(
      properties, schema, SparkSchemaUtil.convert(schema));
  OutputFileFactory fileFactory = new OutputFileFactory(
      spec, format, locations, io.value(), encryptionManager.value(), partitionId, taskId);

  BaseWriter writer;
  if (spec.fields().isEmpty()) {
    writer = new UnpartitionedWriter(spec, format, appenderFactory, fileFactory, io.value(), Long.MAX_VALUE);
  } else {
    writer = new PartitionedWriter(spec, format, appenderFactory, fileFactory, io.value(), Long.MAX_VALUE, schema);
  }

  try {
    while (dataReader.next()) {
      InternalRow row = dataReader.get();
      writer.write(row);
    }

    dataReader.close();
    dataReader = null;
    return writer.complete();

  } catch (Throwable originalThrowable) {
    try {
      LOG.error("Aborting task", originalThrowable);
      context.markTaskFailed(originalThrowable);

      LOG.error("Aborting commit for partition {} (task {}, attempt {}, stage {}.{})",
          partitionId, taskId, context.attemptNumber(), context.stageId(), context.stageAttemptNumber());
      if (dataReader != null) {
        dataReader.close();
      }
      writer.abort();
      LOG.error("Aborted commit for partition {} (task {}, attempt {}, stage {}.{})",
          partitionId, taskId, context.taskAttemptId(), context.stageId(), context.stageAttemptNumber());

    } catch (Throwable inner) {
      if (originalThrowable != inner) {
        originalThrowable.addSuppressed(inner);
        LOG.warn("Suppressing exception in catch: {}", inner.getMessage(), inner);
      }
    }

    if (originalThrowable instanceof Exception) {
      throw originalThrowable;
    } else {
      throw new RuntimeException(originalThrowable);
    }
  }
}
Example #30
Source File: VideoStreamProcessor.java From video-stream-classification with Apache License 2.0
public static void main(String[] args) throws Exception {
  // Read properties
  Properties prop = PropertyFileReader.readPropertyFile();

  // SparkSession
  SparkSession spark = SparkSession
      .builder()
      .appName("VideoStreamProcessor")
      .master(prop.getProperty("spark.master.url"))
      .getOrCreate();

  // directory to save image files with motion detected
  final String processedImageDir = prop.getProperty("processed.output.dir");
  logger.warn("Output directory for saving processed images is set to " + processedImageDir
      + ". This is configured in processed.output.dir key of property file.");

  // create schema for json message
  StructType schema = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("cameraId", DataTypes.StringType, true),
      DataTypes.createStructField("timestamp", DataTypes.TimestampType, true),
      DataTypes.createStructField("rows", DataTypes.IntegerType, true),
      DataTypes.createStructField("cols", DataTypes.IntegerType, true),
      DataTypes.createStructField("type", DataTypes.IntegerType, true),
      DataTypes.createStructField("data", DataTypes.StringType, true)
  });

  // Create DataSet from stream messages from kafka
  Dataset<VideoEventData> ds = spark
      .readStream()
      .format("kafka")
      .option("kafka.bootstrap.servers", prop.getProperty("kafka.bootstrap.servers"))
      .option("subscribe", prop.getProperty("kafka.topic"))
      .option("kafka.max.partition.fetch.bytes", prop.getProperty("kafka.max.partition.fetch.bytes"))
      .option("kafka.max.poll.records", prop.getProperty("kafka.max.poll.records"))
      .load()
      .selectExpr("CAST(value AS STRING) as message")
      .select(functions.from_json(functions.col("message"), schema).as("json"))
      .select("json.*")
      .as(Encoders.bean(VideoEventData.class));

  // key-value pair of cameraId-VideoEventData
  KeyValueGroupedDataset<String, VideoEventData> kvDataset = ds.groupByKey(new MapFunction<VideoEventData, String>() {
    @Override
    public String call(VideoEventData value) throws Exception {
      return value.getCameraId();
    }
  }, Encoders.STRING());

  // process
  Dataset<VideoEventData> processedDataset = kvDataset.mapGroupsWithState(
      new MapGroupsWithStateFunction<String, VideoEventData, VideoEventData, VideoEventData>() {
        @Override
        public VideoEventData call(String key, Iterator<VideoEventData> values,
            GroupState<VideoEventData> state) throws Exception {
          logger.warn("CameraId=" + key + " PartitionId=" + TaskContext.getPartitionId());
          VideoEventData existing = null;
          // check previous state
          if (state.exists()) {
            existing = state.get();
          }
          // classify image
          VideoEventData processed = ImageProcessor.process(key, values, processedImageDir, existing);
          // update last processed
          if (processed != null) {
            state.update(processed);
          }
          return processed;
        }
      }, Encoders.bean(VideoEventData.class), Encoders.bean(VideoEventData.class));

  // start
  StreamingQuery query = processedDataset.writeStream()
      .outputMode("update")
      .format("console")
      .start();

  // await
  query.awaitTermination();
}