org.apache.spark.TaskContext Java Examples
The following examples show how to use
org.apache.spark.TaskContext.
Each example is adapted from an open-source project; the originating source file, project, and license are noted above each snippet.
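Before the project examples, here is a minimal, self-contained sketch (not taken from any of the listed projects) showing the TaskContext calls the examples rely on most: TaskContext.get(), TaskContext.getPartitionId(), stageId(), attemptNumber(), taskAttemptId(), and addTaskCompletionListener. The class name TaskContextDemo and the local[2] master are illustrative assumptions, not part of any project below.

import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.TaskCompletionListener;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class TaskContextDemo {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("TaskContextDemo").setMaster("local[2]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      List<String> info = jsc.parallelize(Arrays.asList(1, 2, 3, 4), 2)
          .mapPartitions(nums -> {
            TaskContext ctx = TaskContext.get(); // non-null only inside a running task
            // Run cleanup when this task finishes, successfully or not
            ctx.addTaskCompletionListener((TaskCompletionListener) done ->
                System.out.println("task " + done.taskAttemptId() + " finished"));
            // Identifiers that the examples below combine into unique file names and write tokens
            String summary = "stage=" + ctx.stageId()
                + " partition=" + TaskContext.getPartitionId()
                + " attempt=" + ctx.attemptNumber()
                + " taskAttemptId=" + ctx.taskAttemptId();
            return Collections.singletonList(summary).iterator();
          })
          .collect();
      info.forEach(System.out::println);
    }
  }
}

The cast to TaskCompletionListener mirrors the pattern used in several examples below; it disambiguates between the listener and Scala function overloads of addTaskCompletionListener.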
Example #1
Source File: TestCopyOnWriteActionExecutor.java From hudi with Apache License 2.0
@Test
public void testMakeNewPath() throws Exception {
  String fileName = UUID.randomUUID().toString();
  String partitionPath = "2016/05/04";
  String instantTime = HoodieTestUtils.makeNewCommitTime();
  HoodieWriteConfig config = makeHoodieClientConfig();
  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable table = HoodieTable.create(metaClient, config, hadoopConf);

  Pair<Path, String> newPathWithWriteToken = jsc.parallelize(Arrays.asList(1)).map(x -> {
    HoodieRecord record = mock(HoodieRecord.class);
    when(record.getPartitionPath()).thenReturn(partitionPath);
    String writeToken = FSUtils.makeWriteToken(TaskContext.getPartitionId(), TaskContext.get().stageId(),
        TaskContext.get().taskAttemptId());
    HoodieCreateHandle io = new HoodieCreateHandle(config, instantTime, table, partitionPath, fileName, supplier);
    return Pair.of(io.makeNewPath(record.getPartitionPath()), writeToken);
  }).collect().get(0);

  assertEquals(newPathWithWriteToken.getKey().toString(),
      Paths.get(this.basePath, partitionPath,
          FSUtils.makeDataFileName(instantTime, newPathWithWriteToken.getRight(), fileName)).toString());
}
Example #2
Source File: SpliceOutputCommitter.java From spliceengine with GNU Affero General Public License v3.0
@Override
public void setupTask(TaskAttemptContext taskContext) throws IOException {
  if (LOG.isDebugEnabled())
    SpliceLogUtils.debug(LOG, "setupTask");

  // Create child additive transaction so we don't read rows inserted by ourselves in this operation
  TaskContext sparkTaskContext = TaskContext.get();
  TaskId taskId = null;
  if (sparkTaskContext != null) {
    int stageId = sparkTaskContext.stageId();
    int partitionId = sparkTaskContext.partitionId();
    int attemptNumber = sparkTaskContext.attemptNumber();
    taskId = new TaskId(stageId, partitionId, attemptNumber);
  }
  TxnView txn = SIDriver.driver().lifecycleManager().beginChildTransaction(parentTxn,
      parentTxn.getIsolationLevel(), true, destinationTable, false, taskId);
  ActiveWriteTxn childTxn = new ActiveWriteTxn(txn.getTxnId(), txn.getTxnId(), parentTxn, true,
      parentTxn.getIsolationLevel(), taskId);
  currentTxn.set(childTxn);
  if (LOG.isDebugEnabled())
    SpliceLogUtils.debug(LOG, "beginTxn=%s and destinationTable=%s", childTxn, destinationTable);
}
Example #3
Source File: SparkTableUtil.java From iceberg with Apache License 2.0
private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
Example #4
Source File: StreamNodeLoader.java From sylph with Apache License 2.0
private static Sink<JavaRDD<Row>> loadRealTimeSink(RealTimeSink realTimeSink) {
  return (Sink<JavaRDD<Row>>) rdd -> rdd.foreachPartition(partition -> {
    Throwable errorOrNull = null;
    try {
      int partitionId = TaskContext.getPartitionId();
      boolean openOK = realTimeSink.open(partitionId, 0); // initialize; only process data if open() succeeds
      if (openOK) {
        partition.forEachRemaining(row -> realTimeSink.process(SparkRecord.make(row)));
      }
    } catch (Exception e) {
      errorOrNull = e; // open() failed
    } finally {
      realTimeSink.close(errorOrNull); // destroy()
    }
  });
}
Example #5
Source File: StreamNodeLoader.java From sylph with Apache License 2.0
public static Iterator<Row> transFunction(Iterator<Row> partition, RealTimeTransForm realTimeTransForm) {
  Exception errorOrNull = null;
  Schema schema = realTimeTransForm.getSchema(); // if not null
  List<Row> list = new ArrayList<>();
  try {
    int partitionId = TaskContext.getPartitionId();
    if (realTimeTransForm.open(partitionId, 0)) {
      partition.forEachRemaining(row -> {
        realTimeTransForm.process(SparkRecord.make(row), (transOutrow) -> {
          //TODO: SparkRow.parserRow(x) with schema ?
          list.add(SparkRecord.parserRow(transOutrow));
        });
      });
    }
  } catch (Exception e) {
    errorOrNull = e; // transformation failed; this batch of data is dropped
  } finally {
    realTimeTransForm.close(errorOrNull); // destroy()
  }
  return list.iterator();
}
Example #6
Source File: NLJoinFunction.java From spliceengine with GNU Affero General Public License v3.0
protected void init(Iterator<ExecRow> from) throws StandardException {
  checkInit();
  taskContext = TaskContext.get();
  if (taskContext != null) {
    taskContext.addTaskCompletionListener((TaskCompletionListener) (t) -> close());
  }
  operationContext.getOperation().registerCloseable(this);
  SConfiguration configuration = EngineDriver.driver().getConfiguration();
  batchSize = configuration.getNestedLoopJoinBatchSize();
  nLeftRows = 0;
  leftSideIterator = from;
  executorService = SIDriver.driver().getExecutorService();
  firstBatch = new ArrayDeque<>(batchSize);

  initOperationContexts();
  loadBatch();
}
Example #7
Source File: ClusterFunctionProvider.java From datacollector with Apache License 2.0
public static synchronized ClusterFunction getClusterFunction() throws Exception {
  // Why such a complex name?
  // When an executor dies and a new one takes its place, having just partition id won't work, because the old file
  // might not have been closed by the namenode since the old executor handling that partition may have just died.
  // So we must ensure a truly unique part which is executor id.
  // ---- BUT ----
  // Multiple partitions of the same job can run on the same executor, which is especially true now since we allow
  // the user to set fewer executors than partitions, so we need the partition id.
  // ---- BUT ----
  // Users could end up not making it unique enough, since partition id and executor id are not unique across jobs,
  // so if they use ${sdc:id()} in 2 cluster pipelines with same directory, then it will still collide, so prefix
  // this with pipeline id.
  // ---- DONE, YAY! ----
  if (clusterFunction == null) {
    clusterFunction = (ClusterFunction) BootstrapCluster.getClusterFunction(
        BootstrapCluster.getProperties().getProperty(ClusterModeConstants.CLUSTER_PIPELINE_NAME)
            + "-" + TaskContext.get().partitionId()
            + "-" + SparkEnv.get().executorId()
    );
  }
  return clusterFunction;
}
Example #8
Source File: SparkAMDSI.java From deeplearning4j with Apache License 2.0
public SparkAMDSI(MultiDataSetIterator iterator, int queueSize, BlockingQueue<MultiDataSet> queue,
                  boolean useWorkspace, DataSetCallback callback, Integer deviceId) {
  this();

  if (queueSize < 2)
    queueSize = 2;

  this.callback = callback;
  this.buffer = queue;
  this.backedIterator = iterator;
  this.useWorkspaces = useWorkspace;
  this.prefetchSize = queueSize;
  this.workspaceId = "SAMDSI_ITER-" + java.util.UUID.randomUUID().toString();
  this.deviceId = deviceId;

  if (iterator.resetSupported())
    this.backedIterator.reset();

  this.thread = new SparkPrefetchThread(buffer, iterator, terminator,
      Nd4j.getAffinityManager().getDeviceForCurrentThread());

  context = TaskContext.get();

  thread.setDaemon(true);
  thread.start();
}
Example #9
Source File: CompatUtils.java From elasticsearch-hadoop with Apache License 2.0
static void addOnCompletition(TaskContext taskContext, final Function0<?> function) {
  taskContext.addTaskCompletionListener(new TaskCompletionListener() {
    @Override
    public void onTaskCompletion(TaskContext context) {
      function.apply();
    }
  });
}
Example #10
Source File: SourceRDD.java From beam with Apache License 2.0
@Override
public scala.collection.Iterator<scala.Tuple2<Source<T>, CheckpointMarkT>> compute(
    Partition split, TaskContext context) {
  @SuppressWarnings("unchecked")
  CheckpointableSourcePartition<T, CheckpointMarkT> partition =
      (CheckpointableSourcePartition<T, CheckpointMarkT>) split;
  scala.Tuple2<Source<T>, CheckpointMarkT> tuple2 =
      new scala.Tuple2<>(partition.getSource(), partition.checkpointMark);
  return JavaConversions.asScalaIterator(Collections.singleton(tuple2).iterator());
}
Example #11
Source File: HiveWarehouseDataReader.java From spark-llap with Apache License 2.0
protected TaskAttemptID getTaskAttemptID(LlapInputSplit split, JobConf conf) throws IOException {
  // Get pseudo-ApplicationId to submit task attempt from external client
  SubmitWorkInfo submitWorkInfo = SubmitWorkInfo.fromBytes(split.getPlanBytes());
  ApplicationId appId = submitWorkInfo.getFakeAppId();
  JobID jobId = new JobID(Long.toString(appId.getClusterTimestamp()), appId.getId());
  // Create TaskAttemptID from Spark TaskContext (TaskType doesn't matter)
  return new TaskAttemptID(new TaskID(jobId, TaskType.MAP, TaskContext.get().partitionId()),
      TaskContext.get().attemptNumber());
}
Example #12
Source File: RemoteDPParForSparkWorker.java From systemds with Apache License 2.0
@Override
public Iterator<Tuple2<Long, String>> call(Iterator<Tuple2<Long, Iterable<Writable>>> arg0) throws Exception {
  //lazy parworker initialization
  configureWorker( TaskContext.get().taskAttemptId() );

  //process all matrix partitions of this data partition
  MatrixBlock partition = null;
  while( arg0.hasNext() ) {
    Tuple2<Long,Iterable<Writable>> larg = arg0.next();

    //collect input partition (check via equals because oinfo deserialized instance)
    if( _fmt == FileFormat.BINARY )
      partition = collectBinaryBlock( larg._2(), partition );
    else
      partition = collectBinaryCellInput( larg._2() );

    //update in-memory matrix partition
    MatrixObject mo = _ec.getMatrixObject( _inputVar );
    mo.setInMemoryPartition( partition );

    //create tasks for input data
    Task lTask = new Task(_iterVar, TaskType.SET);
    lTask.addIteration( new IntObject(larg._1()) );

    //execute program
    long numIter = getExecutedIterations();
    super.executeTask( lTask );

    //maintain accumulators
    _aTasks.add( 1 );
    _aIters.add( (int)(getExecutedIterations()-numIter) );
  }

  //write output if required (matrix indexed write)
  return RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars)
    .stream().map(s -> new Tuple2<>(_workerID, s)).iterator();
}
Example #13
Source File: RemoteParForSparkWorker.java From systemds with Apache License 2.0
@Override
public Iterator<Tuple2<Long, String>> call(Task arg0) throws Exception {
  //lazy parworker initialization
  if( !_initialized )
    configureWorker(TaskContext.get().taskAttemptId());

  //keep input var names
  Set<String> inVars = new HashSet<>(_ec.getVariables().keySet());

  //execute a single task
  long numIter = getExecutedIterations();
  super.executeTask( arg0 );

  //maintain accumulators
  _aTasks.add( 1 );
  _aIters.add( (int)(getExecutedIterations()-numIter) );

  //cleanup remaining intermediate variables from buffer pool
  _ec.getVariables().keySet().stream().filter(v -> !inVars.contains(v))
    .map(v -> _ec.getVariable(v)).filter(d -> d instanceof CacheableData)
    .forEach(c -> ((CacheableData<?>)c).freeEvictedBlob());

  //write output lineage if required
  if( DMLScript.LINEAGE )
    RemoteParForUtils.exportLineageItems(_workerID, _ec.getVariables(), _resultVars, _ec.getLineage());

  //write output if required (matrix indexed write), incl cleanup pinned vars
  //note: this copy is necessary for environments without spark libraries
  return RemoteParForUtils
    .exportResultVariables(_workerID, _ec.getVariables(), _resultVars)
    .stream().map(s -> new Tuple2<>(_workerID, s)).iterator();
}
Example #14
Source File: DeepRDD.java From deep-spark with Apache License 2.0
@Override
public Iterator<T> compute(Partition split, TaskContext context) {
  initExtractorClient();

  extractorClient.initIterator(split, config.getValue());

  context.addTaskCompletionListener(new AbstractFunction1<TaskContext, BoxedUnit>() {
    @Override
    public BoxedUnit apply(TaskContext v1) {
      extractorClient.close();
      return null;
    }
  });

  java.util.Iterator<T> iterator = new java.util.Iterator<T>() {
    @Override
    public boolean hasNext() {
      return extractorClient.hasNext();
    }

    @Override
    public T next() {
      return extractorClient.next();
    }

    @Override
    public void remove() {
      throw new DeepIOException("Method not implemented (and won't be implemented anytime soon!!!)");
    }
  };

  return new InterruptibleIterator<>(context, asScalaIterator(iterator));
}
Example #15
Source File: SparkADSI.java From deeplearning4j with Apache License 2.0
public SparkADSI(DataSetIterator iterator, int queueSize, BlockingQueue<DataSet> queue, boolean useWorkspace,
                 DataSetCallback callback, Integer deviceId) {
  this();

  if (queueSize < 2)
    queueSize = 2;

  this.deviceId = deviceId;
  this.callback = callback;
  this.useWorkspace = useWorkspace;
  this.buffer = queue;
  this.prefetchSize = queueSize;
  this.backedIterator = iterator;
  this.workspaceId = "SADSI_ITER-" + java.util.UUID.randomUUID().toString();

  if (iterator.resetSupported())
    this.backedIterator.reset();

  context = TaskContext.get();

  this.thread = new SparkPrefetchThread(buffer, iterator, terminator, null,
      Nd4j.getAffinityManager().getDeviceForCurrentThread());

  /*
   * We want to ensure that the background thread has the same thread->device affinity as the master thread.
   */
  thread.setDaemon(true);
  thread.start();
}
Example #16
Source File: RemoteDPParForSparkWorker.java From systemds with Apache License 2.0
@Override
public Iterator<Tuple2<Long, String>> call(Iterator<Tuple2<Long, Iterable<Writable>>> arg0) throws Exception {
  //lazy parworker initialization
  configureWorker( TaskContext.get().taskAttemptId() );

  //process all matrix partitions of this data partition
  MatrixBlock partition = null;
  while( arg0.hasNext() ) {
    Tuple2<Long,Iterable<Writable>> larg = arg0.next();

    //collect input partition (check via equals because oinfo deserialized instance)
    if( _oinfo.equals(OutputInfo.BinaryBlockOutputInfo) )
      partition = collectBinaryBlock( larg._2(), partition );
    else
      partition = collectBinaryCellInput( larg._2() );

    //update in-memory matrix partition
    MatrixObject mo = _ec.getMatrixObject( _inputVar );
    mo.setInMemoryPartition( partition );

    //create tasks for input data
    Task lTask = new Task(_iterVar, TaskType.SET);
    lTask.addIteration( new IntObject(larg._1()) );

    //execute program
    long numIter = getExecutedIterations();
    super.executeTask( lTask );

    //maintain accumulators
    _aTasks.add( 1 );
    _aIters.add( (int)(getExecutedIterations()-numIter) );
  }

  //write output if required (matrix indexed write)
  return RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars)
    .stream().map(s -> new Tuple2<>(_workerID, s)).iterator();
}
Example #17
Source File: IteratorUtils.java From spliceengine with GNU Affero General Public License v3.0
public static <E> Iterator<E> asInterruptibleIterator(Iterator<E> it) {
  TaskContext context = TaskContext.get();
  if (context != null) {
    return (Iterator<E>) JavaConverters.asJavaIteratorConverter(
        new InterruptibleIterator(context, JavaConverters.asScalaIteratorConverter(it).asScala())).asJava();
  } else
    return it;
}
Example #18
Source File: SparkLeanOperationContext.java From spliceengine with GNU Affero General Public License v3.0
@Override
@SuppressFBWarnings(value = "ST_WRITE_TO_STATIC_FROM_INSTANCE_METHOD", justification = "intended")
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
  if (in.readBoolean()) {
    SpliceClient.connectionString = in.readUTF();
    SpliceClient.setClient(HConfiguration.getConfiguration().getAuthenticationTokenEnabled(),
        SpliceClient.Mode.EXECUTOR);
  }
  badRecordsSeen = in.readLong();
  badRecordThreshold = in.readLong();
  permissive = in.readBoolean();
  SpliceSpark.setupSpliceStaticComponents();
  boolean isOp = in.readBoolean();
  if (isOp) {
    broadcastedActivation = (BroadcastedActivation) in.readObject();
    ActivationHolder ah = broadcastedActivation.getActivationHolder();
    op = (Op) ah.getOperationsMap().get(in.readInt());
    activation = ah.getActivation();
    TaskContext taskContext = TaskContext.get();
    if (taskContext != null) {
      taskContext.addTaskCompletionListener((TaskCompletionListener) (ctx) -> ah.close());
    }
  }
  badRecordsAccumulator = (Accumulable<BadRecordsRecorder, String>) in.readObject();
  importFileName = (String) in.readObject();
  rowsWritten = (LongAccumulator) in.readObject();
}
Example #19
Source File: KafkaStreamer.java From spliceengine with GNU Affero General Public License v3.0
@Override
public Iterator<String> call(Integer partition, Iterator<T> locatedRowIterator) throws Exception {
  taskContext = TaskContext.get();

  if (taskContext != null && taskContext.attemptNumber() > 0) {
    LOG.trace("KS.c attempts " + taskContext.attemptNumber());
    long entriesInKafka = KafkaUtils.messageCount(bootstrapServers, topicName, partition);
    LOG.trace("KS.c entries " + entriesInKafka);
    for (long i = 0; i < entriesInKafka; ++i) {
      locatedRowIterator.next();
    }
  }

  Properties props = new Properties();
  props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
  props.put(ProducerConfig.CLIENT_ID_CONFIG, "spark-producer-dss-ks-" + UUID.randomUUID());
  props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, IntegerSerializer.class.getName());
  props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ExternalizableSerializer.class.getName());

  KafkaProducer<Integer, Externalizable> producer = new KafkaProducer<>(props);
  int count = 0;

  while (locatedRowIterator.hasNext()) {
    T lr = locatedRowIterator.next();
    ProducerRecord<Integer, Externalizable> record = new ProducerRecord(topicName, count++, lr);
    producer.send(record);
    LOG.trace("KS.c sent " + partition.intValue() + " " + count + " " + lr);
  }
  LOG.trace("KS.c count " + partition.intValue() + " " + count);

  producer.close();
  // TODO Clean up
  return Arrays.asList("OK").iterator();
}
Example #20
Source File: SourceRDD.java From beam with Apache License 2.0
@Override
public scala.collection.Iterator<WindowedValue<T>> compute(final Partition split, final TaskContext context) {
  final MetricsContainer metricsContainer = metricsAccum.value().getContainer(stepName);

  @SuppressWarnings("unchecked")
  final BoundedSource.BoundedReader<T> reader = createReader((SourcePartition<T>) split);

  final Iterator<WindowedValue<T>> readerIterator = new ReaderToIteratorAdapter<>(metricsContainer, reader);

  return new InterruptibleIterator<>(context, JavaConversions.asScalaIterator(readerIterator));
}
Example #21
Source File: RemoteParForSparkWorker.java From systemds with Apache License 2.0
@Override
public Iterator<Tuple2<Long, String>> call(Task arg0) throws Exception {
  //lazy parworker initialization
  if( !_initialized )
    configureWorker(TaskContext.get().taskAttemptId());

  //keep input var names
  Set<String> inVars = new HashSet<>(_ec.getVariables().keySet());

  //execute a single task
  long numIter = getExecutedIterations();
  super.executeTask( arg0 );

  //maintain accumulators
  _aTasks.add( 1 );
  _aIters.add( (int)(getExecutedIterations()-numIter) );

  //cleanup remaining intermediate variables from buffer pool
  _ec.getVariables().keySet().stream().filter(v -> !inVars.contains(v))
    .map(v -> _ec.getVariable(v)).filter(d -> d instanceof CacheableData)
    .forEach(c -> ((CacheableData<?>)c).freeEvictedBlob());

  //write output lineage if required
  if( DMLScript.LINEAGE )
    RemoteParForUtils.exportLineageItems(_workerID, _ec.getVariables(), _resultVars, _ec.getLineage());

  //write output if required (matrix indexed write), incl cleanup pinned vars
  //note: this copy is necessary for environments without spark libraries
  return RemoteParForUtils
    .exportResultVariables(_workerID, _ec.getVariables(), _resultVars)
    .stream().map(s -> new Tuple2<>(_workerID, s)).iterator();
}
Example #22
Source File: ExpKeyFilenameMap.java From incubator-retired-pirk with Apache License 2.0
@Override
public Iterator<Tuple2<Integer, String>> call(Iterator<Tuple2<Integer, Iterable<Tuple2<Integer, BigInteger>>>> iter)
    throws Exception {
  List<Tuple2<Integer, String>> keyFileList = new ArrayList<>();

  FileSystem fs = FileSystem.get(new Configuration());

  // Form the filename for the exp table portion that corresponds to this partition
  int taskId = TaskContext.getPartitionId();
  logger.info("taskId = " + taskId);

  String fileName = expOutDir + "/exp-" + String.format("%05d", taskId);
  logger.info("fileName = " + fileName);

  // Iterate over the elements of the partition
  BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(fileName), true)));
  while (iter.hasNext()) {
    // <queryHash, <<power>,<element^power mod N^2>>
    Tuple2<Integer, Iterable<Tuple2<Integer, BigInteger>>> expTuple = iter.next();
    int queryHash = expTuple._1;

    // Record the queryHash -> fileName
    keyFileList.add(new Tuple2<>(queryHash, fileName));

    // Write the partition elements to the corresponding exp table file
    // each line: queryHash,<power>-<element^power mod N^2>
    for (Tuple2<Integer, BigInteger> modPow : expTuple._2) {
      String lineOut = queryHash + "," + modPow._1 + "-" + modPow._2;
      bw.write(lineOut);
      bw.newLine();
    }
  }
  bw.close();

  return keyFileList.iterator();
}
Example #23
Source File: MizoRDD.java From mizo with Apache License 2.0
@Override
public scala.collection.Iterator<TReturn> compute(Partition split, TaskContext context) {
  String regionEdgesFamilyPath = this.regionsPaths.get(split.index());
  log.info("Running Mizo on region #{} located at: {}", split.index(), regionEdgesFamilyPath);

  return createRegionIterator(createRegionRelationsIterator(regionEdgesFamilyPath));
}
Example #24
Source File: SMRecordReaderImpl.java From spliceengine with GNU Affero General Public License v3.0
@Override
public void onTaskFailure(TaskContext context, Throwable error) {
  LOG.error("Task failed for split: " + split, error);
}
Example #25
Source File: SMRecordReaderImpl.java From spliceengine with GNU Affero General Public License v3.0
public void init(Configuration config, InputSplit split) throws IOException, InterruptedException {
  if (LOG.isDebugEnabled())
    SpliceLogUtils.debug(LOG, "init");
  if (TaskContext.get() != null) {
    TaskContext.get().addTaskFailureListener(this);
  }
  String tableScannerAsString = config.get(MRConstants.SPLICE_SCAN_INFO);
  if (tableScannerAsString == null)
    throw new IOException("splice scan info was not serialized to task, failing");
  byte[] scanStartKey = null;
  byte[] scanStopKey = null;
  try {
    builder = TableScannerBuilder.getTableScannerBuilderFromBase64String(tableScannerAsString);
    if (LOG.isTraceEnabled())
      SpliceLogUtils.trace(LOG, "config loaded builder=%s", builder);
    TableSplit tSplit = ((SMSplit) split).getSplit();
    token = builder.getToken();
    DataScan scan = builder.getScan();
    scanStartKey = scan.getStartKey();
    scanStopKey = scan.getStopKey();
    if (Bytes.startComparator.compare(scanStartKey, tSplit.getStartRow()) < 0) {
      // the split itself is more restrictive
      scan.startKey(tSplit.getStartRow());
    }
    if (Bytes.endComparator.compare(scanStopKey, tSplit.getEndRow()) > 0) {
      // the split itself is more restrictive
      scan.stopKey(tSplit.getEndRow());
    }
    setScan(((HScan) scan).unwrapDelegate());
    // TODO (wjk): this seems weird (added with DB-4483)
    this.statisticsRun = AbstractSMInputFormat.oneSplitPerRegion(config);
    Double sampling = AbstractSMInputFormat.sampling(config);
    if (sampling != null) {
      this.sampling = true;
      this.samplingRate = sampling;
    }
    restart(scan.getStartKey());
  } catch (IOException ioe) {
    LOG.error(String.format("Received exception with scan %s, original start key %s, original stop key %s, split %s",
        scan, Bytes.toStringBinary(scanStartKey), Bytes.toStringBinary(scanStopKey), split), ioe);
    throw ioe;
  } catch (StandardException e) {
    throw new IOException(e);
  }
}
Example #26
Source File: SparkFactDistinct.java From kylin-on-parquet-v2 with Apache License 2.0
private void init() throws IOException {
  taskId = TaskContext.getPartitionId();
  kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
  try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
      .setAndUnsetThreadLocalConfig(kConfig)) {
    CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName);
    cubeDesc = cubeInstance.getDescriptor();
    cubeConfig = cubeInstance.getConfig();
    reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance);

    result = Lists.newArrayList();

    if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
      // hll
      isStatistics = true;
      baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId();
      baseCuboidRowCountInMappers = Lists.newArrayList();
      cuboidHLLMap = Maps.newHashMap();

      logger.info("Partition {} handling stats", taskId);
    } else {
      // normal col
      col = reducerMapping.getColForReducer(taskId);
      Preconditions.checkNotNull(col);

      isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col)
          && col.getType().needCompare();
      isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col);

      // local build dict
      buildDictInReducer = kConfig.isBuildDictInReducerEnabled();
      if (cubeDesc.getDictionaryBuilderClass(col) != null) {
        // only works with default dictionary builder
        buildDictInReducer = false;
      }
      if (reducerMapping.getReducerNumForDimCol(col) > 1) {
        buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
      }
      if (buildDictInReducer) {
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
      }
      logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer);
    }

    initialized = true;
  }
}
Example #27
Source File: KafkaReadFunction.java From spliceengine with GNU Affero General Public License v3.0
@Override
public Iterator<ExecRow> call(Integer partition) throws Exception {
  Properties props = new Properties();
  props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);

  String consumer_id = "spark-consumer-dss-krf-" + UUID.randomUUID();
  props.put(ConsumerConfig.GROUP_ID_CONFIG, consumer_id);
  props.put(ConsumerConfig.CLIENT_ID_CONFIG, consumer_id);

  props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, IntegerDeserializer.class.getName());
  props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ExternalizableDeserializer.class.getName());
  props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

  KafkaConsumer<Integer, Externalizable> consumer = new KafkaConsumer<Integer, Externalizable>(props);
  consumer.assign(Arrays.asList(new TopicPartition(topicName, partition)));

  return new Iterator<ExecRow>() {
    Iterator<ConsumerRecord<Integer, Externalizable>> it = null;

    @Override
    public boolean hasNext() {
      if (it == null) {
        ConsumerRecords<Integer, Externalizable> records = null;
        while (records == null || records.isEmpty()) {
          records = consumer.poll(java.time.Duration.ofMillis(1000));
          if (TaskContext.get().isInterrupted()) {
            consumer.close();
            throw new TaskKilledException();
          }
        }
        it = records.iterator();
      }
      if (it.hasNext()) {
        return true;
      } else {
        consumer.close();
        return false;
      }
    }

    @Override
    public ExecRow next() {
      return (ExecRow) it.next().value();
    }
  };
}
Example #28
Source File: SparkJavaRDD.java From incubator-nemo with Apache License 2.0
@Override
public Iterator<T> iterator(final Partition split, final TaskContext taskContext) {
  throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
Example #29
Source File: RowDataRewriter.java From iceberg with Apache License 2.0
private TaskResult rewriteDataForTask(CombinedScanTask task) throws Exception {
  TaskContext context = TaskContext.get();
  int partitionId = context.partitionId();
  long taskId = context.taskAttemptId();

  RowDataReader dataReader = new RowDataReader(
      task, schema, schema, nameMapping, io.value(), encryptionManager.value(), caseSensitive);

  SparkAppenderFactory appenderFactory = new SparkAppenderFactory(
      properties, schema, SparkSchemaUtil.convert(schema));
  OutputFileFactory fileFactory = new OutputFileFactory(
      spec, format, locations, io.value(), encryptionManager.value(), partitionId, taskId);

  BaseWriter writer;
  if (spec.fields().isEmpty()) {
    writer = new UnpartitionedWriter(spec, format, appenderFactory, fileFactory, io.value(), Long.MAX_VALUE);
  } else {
    writer = new PartitionedWriter(spec, format, appenderFactory, fileFactory, io.value(), Long.MAX_VALUE, schema);
  }

  try {
    while (dataReader.next()) {
      InternalRow row = dataReader.get();
      writer.write(row);
    }

    dataReader.close();
    dataReader = null;
    return writer.complete();

  } catch (Throwable originalThrowable) {
    try {
      LOG.error("Aborting task", originalThrowable);
      context.markTaskFailed(originalThrowable);

      LOG.error("Aborting commit for partition {} (task {}, attempt {}, stage {}.{})",
          partitionId, taskId, context.attemptNumber(), context.stageId(), context.stageAttemptNumber());
      if (dataReader != null) {
        dataReader.close();
      }
      writer.abort();
      LOG.error("Aborted commit for partition {} (task {}, attempt {}, stage {}.{})",
          partitionId, taskId, context.taskAttemptId(), context.stageId(), context.stageAttemptNumber());

    } catch (Throwable inner) {
      if (originalThrowable != inner) {
        originalThrowable.addSuppressed(inner);
        LOG.warn("Suppressing exception in catch: {}", inner.getMessage(), inner);
      }
    }

    if (originalThrowable instanceof Exception) {
      throw originalThrowable;
    } else {
      throw new RuntimeException(originalThrowable);
    }
  }
}
Example #30
Source File: VideoStreamProcessor.java From video-stream-classification with Apache License 2.0
public static void main(String[] args) throws Exception {
  // Read properties
  Properties prop = PropertyFileReader.readPropertyFile();

  // SparkSession
  SparkSession spark = SparkSession
      .builder()
      .appName("VideoStreamProcessor")
      .master(prop.getProperty("spark.master.url"))
      .getOrCreate();

  // directory to save image files with motion detected
  final String processedImageDir = prop.getProperty("processed.output.dir");
  logger.warn("Output directory for saving processed images is set to " + processedImageDir
      + ". This is configured in processed.output.dir key of property file.");

  // create schema for json message
  StructType schema = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("cameraId", DataTypes.StringType, true),
      DataTypes.createStructField("timestamp", DataTypes.TimestampType, true),
      DataTypes.createStructField("rows", DataTypes.IntegerType, true),
      DataTypes.createStructField("cols", DataTypes.IntegerType, true),
      DataTypes.createStructField("type", DataTypes.IntegerType, true),
      DataTypes.createStructField("data", DataTypes.StringType, true)
  });

  // Create DataSet from stream messages from kafka
  Dataset<VideoEventData> ds = spark
      .readStream()
      .format("kafka")
      .option("kafka.bootstrap.servers", prop.getProperty("kafka.bootstrap.servers"))
      .option("subscribe", prop.getProperty("kafka.topic"))
      .option("kafka.max.partition.fetch.bytes", prop.getProperty("kafka.max.partition.fetch.bytes"))
      .option("kafka.max.poll.records", prop.getProperty("kafka.max.poll.records"))
      .load()
      .selectExpr("CAST(value AS STRING) as message")
      .select(functions.from_json(functions.col("message"), schema).as("json"))
      .select("json.*")
      .as(Encoders.bean(VideoEventData.class));

  // key-value pair of cameraId-VideoEventData
  KeyValueGroupedDataset<String, VideoEventData> kvDataset = ds.groupByKey(new MapFunction<VideoEventData, String>() {
    @Override
    public String call(VideoEventData value) throws Exception {
      return value.getCameraId();
    }
  }, Encoders.STRING());

  // process
  Dataset<VideoEventData> processedDataset = kvDataset.mapGroupsWithState(
      new MapGroupsWithStateFunction<String, VideoEventData, VideoEventData, VideoEventData>() {
        @Override
        public VideoEventData call(String key, Iterator<VideoEventData> values,
            GroupState<VideoEventData> state) throws Exception {
          logger.warn("CameraId=" + key + " PartitionId=" + TaskContext.getPartitionId());
          VideoEventData existing = null;
          // check previous state
          if (state.exists()) {
            existing = state.get();
          }
          // classify image
          VideoEventData processed = ImageProcessor.process(key, values, processedImageDir, existing);
          // update last processed
          if (processed != null) {
            state.update(processed);
          }
          return processed;
        }
      }, Encoders.bean(VideoEventData.class), Encoders.bean(VideoEventData.class));

  // start
  StreamingQuery query = processedDataset.writeStream()
      .outputMode("update")
      .format("console")
      .start();

  // await
  query.awaitTermination();
}