Java Code Examples for org.apache.spark.sql.catalyst.expressions.GenericInternalRow

The following examples show how to use org.apache.spark.sql.catalyst.expressions.GenericInternalRow. They are extracted from open source projects; the source project, source file, and license are noted above each example.
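Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the API that these snippets exercise: constructing a GenericInternalRow with a fixed field count or from an Object[] of pre-converted values, populating it with the typed setters and update, and reading it back through the InternalRow interface. The class name GenericInternalRowDemo is purely illustrative.

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.unsafe.types.UTF8String;

public class GenericInternalRowDemo {
    public static void main(String[] args) {
        // Construct a row with a fixed number of fields; every field starts out null.
        GenericInternalRow row = new GenericInternalRow(3);

        // Populate fields with the typed setters and the generic update method.
        row.setInt(0, 42);
        row.update(1, UTF8String.fromString("example")); // strings are stored as UTF8String
        row.setNullAt(2);                                // explicitly mark a field as null

        // Read the values back through the InternalRow interface.
        InternalRow asInternalRow = row;
        System.out.println(asInternalRow.numFields());      // 3
        System.out.println(asInternalRow.getInt(0));         // 42
        System.out.println(asInternalRow.getUTF8String(1));  // example
        System.out.println(asInternalRow.isNullAt(2));       // true

        // Alternatively, wrap an existing Object[] of already-converted values,
        // as Example 1 below does.
        Object[] values = new Object[] { 42, UTF8String.fromString("example"), null };
        GenericInternalRow fromValues = new GenericInternalRow(values);
        System.out.println(fromValues.numFields());           // 3
    }
}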
Example 1
Source Project: spark-bigquery-connector   Source File: SchemaConverters.java    License: Apache License 2.0
static GenericInternalRow convertAll(FieldList fieldList,
                                     GenericRecord record,
                                     List<String> namesInOrder) {

    // Convert each field value to its Spark-internal representation, keyed by field name.
    Map<String, Object> fieldMap = new HashMap<>();

    fieldList.stream().forEach(field ->
            fieldMap.put(field.getName(), convert(field, record.get(field.getName()))));

    // Arrange the converted values to match the requested column order.
    Object[] values = new Object[namesInOrder.size()];
    for (int i = 0; i < namesInOrder.size(); i++) {
        values[i] = fieldMap.get(namesInOrder.get(i));
    }

    return new GenericInternalRow(values);
}
 
Example 2
Source Project: iceberg   Source File: Reader.java    License: Apache License 2.0
PartitionRowConverter(Schema partitionSchema, PartitionSpec spec) {
  StructType partitionType = SparkSchemaUtil.convert(partitionSchema);
  StructField[] fields = partitionType.fields();

  this.types = new DataType[fields.length];
  this.positions = new int[types.length];
  this.javaTypes = new Class<?>[types.length];
  this.reusedRow = new GenericInternalRow(types.length);

  // For each partition column, find the matching identity-transform field in the spec and
  // record its position and Java class so partition values can later be copied into reusedRow.
  List<PartitionField> partitionFields = spec.fields();
  for (int rowIndex = 0; rowIndex < fields.length; rowIndex += 1) {
    this.types[rowIndex] = fields[rowIndex].dataType();

    int sourceId = partitionSchema.columns().get(rowIndex).fieldId();
    for (int specIndex = 0; specIndex < partitionFields.size(); specIndex += 1) {
      PartitionField field = spec.fields().get(specIndex);
      if (field.sourceId() == sourceId && "identity".equals(field.transform().toString())) {
        positions[rowIndex] = specIndex;
        javaTypes[rowIndex] = spec.javaClasses()[specIndex];
        break;
      }
    }
  }
}
 
Example 3
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected GenericInternalRow newStructData(InternalRow reuse) {
  if (reuse instanceof GenericInternalRow) {
    return (GenericInternalRow) reuse;
  } else {
    return new GenericInternalRow(numFields);
  }
}
 
Example 4
Source Project: iceberg   Source File: SparkValueReaders.java    License: Apache License 2.0
@Override
protected InternalRow reuseOrCreate(Object reuse) {
  if (reuse instanceof GenericInternalRow && ((GenericInternalRow) reuse).numFields() == numFields) {
    return (InternalRow) reuse;
  }
  return new GenericInternalRow(numFields);
}
 
Example 5
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected GenericInternalRow newStructData(InternalRow reuse) {
  if (reuse instanceof GenericInternalRow) {
    return (GenericInternalRow) reuse;
  } else {
    return new GenericInternalRow(numFields);
  }
}
 
Example 6
Source Project: rdf2x   Source File: SparkContextProvider.java    License: Apache License 2.0
public static Class[] getSerializableClasses() {
    return new Class[]{
            Instance.class, Predicate.class, RelationPredicate.class, RelationRow.class,
            TypeID.class, HashMap.class, HashSet.class, LiteralType.class, Object[].class,
            InternalRow[].class, GenericInternalRow.class, IndexMap.class, Quad.class
    };
}
 
Example 7
Source Project: geowave   Source File: AbstractGeometryUDT.java    License: Apache License 2.0
@Override
public InternalRow serialize(final T obj) {
  // Encode the geometry as TWKB bytes and store them in field 0 of a new internal row.
  final byte[] bytes = new TWKBWriter().write(obj);
  final InternalRow returnRow = new GenericInternalRow(bytes.length);
  returnRow.update(0, bytes);
  return returnRow;
}
 
Example 8
Source Project: ambiverse-nlu   Source File: Data2CoNLL.java    License: Apache License 2.0
@Override
protected int run() throws Exception {

  SparkConf sparkConf = new SparkConf()
      .setAppName("Data2CoNLL")
      .set("spark.hadoop.validateOutputSpecs", "false")
      .set("spark.yarn.executor.memoryOverhead", "3072")
      .set("spark.rdd.compress", "true")
      .set("spark.core.connection.ack.wait.timeout", "600")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      //.set("spark.kryo.registrationRequired", "true")
      .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class, double[].class,
          InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
          VectorIndexer.class})
      ;//.setMaster("local[4]"); //Remove this if you run it on the server.


  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
      * Integer.parseInt(sc.getConf().get("spark.executor.cores"));

  FileSystem fs = FileSystem.get(new Configuration());

  int partitionNumber = 3 * totalCores;
  if(partitions != null) {
    partitionNumber = partitions;
  }

  //Read training documents serialized as SCAS
  JavaRDD<SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber).values();

  JavaRDD<String> docStrings = documents.map( s -> {
    JCas jCas = s.getJCas();
    NYTArticleMetaData metadata = JCasUtil.selectSingle(jCas, NYTArticleMetaData.class);

    StringJoiner docBuilder = new StringJoiner("\n");

    docBuilder.add("-DOCSTART- (" +  metadata.getGuid() + ")");
    docBuilder.add("");

    Collection<Sentence> sentences = JCasUtil.select(jCas, Sentence.class);
    for(Sentence sentence: sentences) {
      List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
      for(Token token: tokens) {
        CoreLabel taggedWord = CoreNlpUtils.tokenToWord(token);
        StringJoiner lineBuilder = new StringJoiner("\t");
        lineBuilder.add(taggedWord.word().toLowerCase());
        docBuilder.add(lineBuilder.toString());
      }
      docBuilder.add("");
    }
    return docBuilder.toString();
  });

  docStrings.saveAsTextFile(output);
  sc.stop();
  return 0;
}
 
Example 9
@Override
    protected int run() throws Exception {

        SparkConf sparkConf = new SparkConf()
                .setAppName("EntitySalienceTrainingSparkRunner")
                .set("spark.hadoop.validateOutputSpecs", "false")
                .set("spark.yarn.executor.memoryOverhead", "3072")
                .set("spark.rdd.compress", "true")
                .set("spark.core.connection.ack.wait.timeout", "600")
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                //.set("spark.kryo.registrationRequired", "true")
                .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class, double[].class,
                        InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
                        VectorIndexer.class})
                ;//.setMaster("local[4]"); //Remove this if you run it on the server.

        TrainingSettings trainingSettings = new TrainingSettings();

        if(folds != null) {
            trainingSettings.setNumFolds(folds);
        }
        if(method != null) {
            trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.valueOf(method));
        }
        if(defaultConf != null) {
            trainingSettings.setAidaDefaultConf(defaultConf);
        }

        if(scalingFactor != null) {
            trainingSettings.setPositiveInstanceScalingFactor(scalingFactor);
        }

        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
                * Integer.parseInt(sc.getConf().get("spark.executor.cores"));

//        int totalCores = 4;
////        trainingSettings.setFeatureExtractor(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE);
////        trainingSettings.setAidaDefaultConf("db");
//        //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);
//        trainingSettings.setPositiveInstanceScalingFactor(1);

        //Add the cache files to each node only if annotation is required.
        //The input documents could already be annotated, and in this case no caches are needed.
        if(trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
            sc.addFile(trainingSettings.getBigramCountCache());
            sc.addFile(trainingSettings.getKeywordCountCache());
            sc.addFile(trainingSettings.getWordContractionsCache());
            sc.addFile(trainingSettings.getWordExpansionsCache());
            if (trainingSettings.getAidaDefaultConf().equals("db")) {
                sc.addFile(trainingSettings.getDatabaseAida());
            } else {
                sc.addFile(trainingSettings.getCassandraConfig());
            }
        }

        SQLContext sqlContext = new SQLContext(sc);


        FileSystem fs = FileSystem.get(new Configuration());

        int partitionNumber = 3 * totalCores;
        if(partitions != null) {
            partitionNumber = partitions;
        }

        //Read training documents serialized as SCAS
        JavaRDD<SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber).values();

        //Instantiate a training Spark runner
        TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();

        //Train a model
        CrossValidatorModel model = trainingSparkRunner.crossValidate(sc, sqlContext, documents, trainingSettings);


        //Create the model path
        String modelPath = output+"/"+sc.getConf().getAppId()+"/model_"+trainingSettings.getClassificationMethod();

        //Delete the old model if there is one
        fs.delete(new Path(modelPath), true);

        //Save the new model
        List<Model> models = new ArrayList<>();
        models.add(model.bestModel());
        sc.parallelize(models, 1).saveAsObjectFile(modelPath);

        //Save the model stats
        SparkClassificationModel.saveStats(model, trainingSettings, output+"/"+sc.getConf().getAppId()+"/");


        return 0;
    }
 
Example 10
@Override
    protected int run() throws Exception {

        SparkConf sparkConf = new SparkConf()
                .setAppName("EntitySalienceTrainingSparkRunner")
                .set("spark.hadoop.validateOutputSpecs", "false")
                //.set("spark.yarn.executor.memoryOverhead", "4096")
                .set("spark.rdd.compress", "true")
                .set("spark.core.connection.ack.wait.timeout", "600")
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                //.set("spark.kryo.registrationRequired", "true")
                .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class, double[].class,
                        InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
                        VectorIndexer.class})
                ;//setMaster("local"); //Remove this if you run it on the server.

        TrainingSettings trainingSettings = new TrainingSettings();

        if(defaultConf != null) {
            trainingSettings.setAidaDefaultConf(defaultConf);
        }


        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
                * Integer.parseInt(sc.getConf().get("spark.executor.cores"));

//        int totalCores = 2;

        //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);

        trainingSettings.setPositiveInstanceScalingFactor(1);
        if(trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
            sc.addFile(trainingSettings.getBigramCountCache());
            sc.addFile(trainingSettings.getKeywordCountCache());
            sc.addFile(trainingSettings.getWordContractionsCache());
            sc.addFile(trainingSettings.getWordExpansionsCache());
            if (trainingSettings.getAidaDefaultConf().equals("db")) {
                sc.addFile(trainingSettings.getDatabaseAida());
            } else {
                sc.addFile(trainingSettings.getCassandraConfig());
            }
        }

        SQLContext sqlContext = new SQLContext(sc);


        int partitionNumber = 3 * totalCores;
        //Read training documents serialized as SCAS
        JavaPairRDD<Text, SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber);

        //Instantiate a training Spark runner
        TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();


        PipelineModel trainingModel = (PipelineModel) sc.objectFile(model).first();

        //Evaluate the model and write down the evaluation metrics.
        trainingSparkRunner.evaluate(sc, sqlContext, documents, trainingModel, trainingSettings, output+"/"+sc.getConf().getAppId()+"/");

        return 0;
    }
 
Example 11
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected Object getField(GenericInternalRow intermediate, int pos) {
  return intermediate.genericGet(pos);
}
 
Example 12
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected InternalRow buildStruct(GenericInternalRow struct) {
  return struct;
}
 
Example 13
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void set(GenericInternalRow row, int pos, Object value) {
  row.update(pos, value);
}
 
Example 14
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setNull(GenericInternalRow row, int pos) {
  row.setNullAt(pos);
}
 
Example 15
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setBoolean(GenericInternalRow row, int pos, boolean value) {
  row.setBoolean(pos, value);
}
 
Example 16
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setInteger(GenericInternalRow row, int pos, int value) {
  row.setInt(pos, value);
}
 
Example 17
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setLong(GenericInternalRow row, int pos, long value) {
  row.setLong(pos, value);
}
 
Example 18
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setFloat(GenericInternalRow row, int pos, float value) {
  row.setFloat(pos, value);
}
 
Example 19
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setDouble(GenericInternalRow row, int pos, double value) {
  row.setDouble(pos, value);
}
 
Example 20
Source Project: iceberg   Source File: SparkOrcValueReaders.java    License: Apache License 2.0
@Override
protected InternalRow create() {
  return new GenericInternalRow(numFields);
}
 
Example 21
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected Object getField(GenericInternalRow intermediate, int pos) {
  return intermediate.genericGet(pos);
}
 
Example 22
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected InternalRow buildStruct(GenericInternalRow struct) {
  return struct;
}
 
Example 23
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void set(GenericInternalRow row, int pos, Object value) {
  row.update(pos, value);
}
 
Example 24
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setNull(GenericInternalRow row, int pos) {
  row.setNullAt(pos);
}
 
Example 25
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setBoolean(GenericInternalRow row, int pos, boolean value) {
  row.setBoolean(pos, value);
}
 
Example 26
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setInteger(GenericInternalRow row, int pos, int value) {
  row.setInt(pos, value);
}
 
Example 27
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setLong(GenericInternalRow row, int pos, long value) {
  row.setLong(pos, value);
}
 
Example 28
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setFloat(GenericInternalRow row, int pos, float value) {
  row.setFloat(pos, value);
}
 
Example 29
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0
@Override
protected void setDouble(GenericInternalRow row, int pos, double value) {
  row.setDouble(pos, value);
}