Java Code Examples for org.apache.spark.TaskContext#getPartitionId()

The following examples show how to use org.apache.spark.TaskContext#getPartitionId(). Each example is drawn from an open-source project; the source file, originating project, and license are noted above the code.
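Before the project examples, the minimal sketch below shows the basic pattern: TaskContext.getPartitionId() is a static method that reports the partition id of the currently executing task, so it is normally called from inside a function passed to mapPartitions or foreachPartition. The class name, RDD contents, and local master setting here are illustrative and not taken from any of the projects below.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class PartitionIdDemo
{
    public static void main(String[] args)
    {
        JavaSparkContext sc = new JavaSparkContext("local[2]", "partition-id-demo");
        JavaRDD<String> words = sc.parallelize(Arrays.asList("a", "b", "c", "d"), 2);

        // Tag every element with the id of the partition it is processed in.
        // TaskContext.getPartitionId() is meaningful here because this lambda
        // runs inside a Spark task on the executor.
        JavaRDD<String> tagged = words.mapPartitions(iter -> {
            int partitionId = TaskContext.getPartitionId();
            List<String> out = new ArrayList<>();
            iter.forEachRemaining(value -> out.add(partitionId + ":" + value));
            return out.iterator();
        });

        tagged.collect().forEach(System.out::println);
        sc.close();
    }
}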
Example 1
Source File: StreamNodeLoader.java    From sylph with Apache License 2.0
public static Iterator<Row> transFunction(Iterator<Row> partition, RealTimeTransForm realTimeTransForm)
{
    Exception errorOrNull = null;
    Schema schema = realTimeTransForm.getSchema(); // may be null
    List<Row> list = new ArrayList<>();
    try {
        int partitionId = TaskContext.getPartitionId();
        if (realTimeTransForm.open(partitionId, 0)) {
            partition.forEachRemaining(row -> {
                realTimeTransForm.process(SparkRecord.make(row), (transOutrow) -> {
                    //TODO: SparkRow.parserRow(x) with schema ?
                    list.add(SparkRecord.parserRow(transOutrow));
                });
            });
        }
    }
    catch (Exception e) {
        errorOrNull = e; // transform failed; discard this whole batch of data
    }
    finally {
        realTimeTransForm.close(errorOrNull); //destroy()
    }
    return list.iterator();
}
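This transform follows a per-partition lifecycle: open(partitionId, ...) is called once with the id obtained from TaskContext.getPartitionId(), process(...) runs for every row in the partition, and close(...) always executes in the finally block, receiving the exception if the conversion failed. Because the method maps an Iterator<Row> to an Iterator<Row>, it would typically be handed to mapPartitions so that each executor task transforms exactly one partition.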
 
Example 2
Source File: ExpKeyFilenameMap.java    From incubator-retired-pirk with Apache License 2.0
@Override
public Iterator<Tuple2<Integer, String>> call(Iterator<Tuple2<Integer,Iterable<Tuple2<Integer,BigInteger>>>> iter) throws Exception
{
  List<Tuple2<Integer,String>> keyFileList = new ArrayList<>();

  FileSystem fs = FileSystem.get(new Configuration());

  // Form the filename for the exp table portion that corresponds to this partition
  int taskId = TaskContext.getPartitionId();
  logger.info("taskId = " + taskId);

  String fileName = expOutDir + "/exp-" + String.format("%05d", taskId);
  logger.info("fileName = " + fileName);

  // Iterate over the elements of the partition
  BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(fileName), true)));
  while (iter.hasNext())
  {
    // each element: <queryHash, iterable of <power, element^power mod N^2>>
    Tuple2<Integer,Iterable<Tuple2<Integer,BigInteger>>> expTuple = iter.next();
    int queryHash = expTuple._1;

    // Record the queryHash -> fileName
    keyFileList.add(new Tuple2<>(queryHash, fileName));

    // Write the partition elements to the corresponding exp table file
    // each line: queryHash,<power>-<element^power mod N^2>
    for (Tuple2<Integer,BigInteger> modPow : expTuple._2)
    {
      String lineOut = queryHash + "," + modPow._1 + "-" + modPow._2;
      bw.write(lineOut);
      bw.newLine();
    }
  }
  bw.close();

  return keyFileList.iterator();
}
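Here the partition id becomes part of the output path: String.format("%05d", taskId) gives every partition its own exp table file under expOutDir, so concurrent tasks never write to the same HDFS path, while the returned (queryHash, fileName) pairs record which file holds the rows for each query hash.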
 
Example 3
Source File: SparkFactDistinct.java    From kylin-on-parquet-v2 with Apache License 2.0
private void init() throws IOException {
    taskId = TaskContext.getPartitionId();
    kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(kConfig)) {
        CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName);
        cubeDesc = cubeInstance.getDescriptor();
        cubeConfig = cubeInstance.getConfig();
        reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance);

        result = Lists.newArrayList();

        if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
            // HLL (HyperLogLog) cuboid row-count statistics
            isStatistics = true;
            baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId();
            baseCuboidRowCountInMappers = Lists.newArrayList();
            cuboidHLLMap = Maps.newHashMap();

            logger.info("Partition {} handling stats", taskId);
        } else {
            // normal col
            col = reducerMapping.getColForReducer(taskId);
            Preconditions.checkNotNull(col);

            isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare();
            isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col);

            // local build dict
            buildDictInReducer = kConfig.isBuildDictInReducerEnabled();
            if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
                buildDictInReducer = false;
            }

            if (reducerMapping.getReducerNumForDimCol(col) > 1) {
                buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
            }

            if (buildDictInReducer) {
                builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
                builder.init(null, 0, null);
            }
            logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer);
        }

        initialized = true;
    }
}
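In this reducer-style setup the partition id doubles as a reducer id: FactDistinctColumnsReducerMapping uses taskId to decide whether this partition computes cuboid row-count statistics (the HyperLogLog branch) or handles a single column, building that column's dictionary locally only when the default dictionary builder is used and exactly one reducer is assigned to the column.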
 
Example 4
Source File: SparkTaskContextSupplier.java    From hudi with Apache License 2.0
public Supplier<Integer> getPartitionIdSupplier() {
  return () -> TaskContext.getPartitionId();
}
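Wrapping the call in a Supplier defers the lookup until get() is invoked, which matters because the partition id is only meaningful while the code is running inside a Spark task. The small consumer below is a hypothetical illustration, not part of Hudi:

import java.util.function.Supplier;

import org.apache.spark.TaskContext;

public class PartitionIdSupplierDemo
{
    // Same shape as the Hudi accessor above: the id is read lazily, at get() time.
    static Supplier<Integer> partitionIdSupplier()
    {
        return () -> TaskContext.getPartitionId();
    }

    // Hypothetical consumer; call it only from code executing inside a task.
    static String describeCurrentPartition(Supplier<Integer> partitionId)
    {
        return "running in partition " + partitionId.get();
    }
}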
 
Example 5
Source File: SparkFactDistinct.java    From kylin with Apache License 2.0
private void init() throws IOException {
    taskId = TaskContext.getPartitionId();
    kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(kConfig)) {
        CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName);
        cubeDesc = cubeInstance.getDescriptor();
        cubeConfig = cubeInstance.getConfig();
        reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance);

        result = Lists.newArrayList();

        if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
            // HLL (HyperLogLog) cuboid row-count statistics
            isStatistics = true;
            baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId();
            baseCuboidRowCountInMappers = Lists.newArrayList();
            cuboidHLLMap = Maps.newHashMap();

            logger.info("Partition {} handling stats", taskId);
        } else {
            // normal col
            col = reducerMapping.getColForReducer(taskId);
            Preconditions.checkNotNull(col);

            isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare();
            isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col);

            // local build dict
            buildDictInReducer = kConfig.isBuildDictInReducerEnabled();
            if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
                buildDictInReducer = false;
            }

            if (reducerMapping.getReducerNumForDimCol(col) > 1) {
                buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
            }

            if (buildDictInReducer) {
                builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
                builder.init(null, 0, null);
            }
            logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer);
        }

        initialized = true;
    }
}