com.google.api.services.bigquery.model.TableSchema Java Examples

The following examples show how to use com.google.api.services.bigquery.model.TableSchema. They are drawn from a range of open-source projects; each example's header notes the source file, the project it comes from, and that project's license.
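As a quick orientation before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below): a TableSchema is simply a list of TableFieldSchema entries, each carrying a name, a type, and an optional mode.

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableSchema;

import java.util.Arrays;

public class TableSchemaDemo {
  public static void main(String[] args) {
    // A two-column schema: a required STRING key and a nullable INTEGER value.
    TableSchema schema =
        new TableSchema()
            .setFields(
                Arrays.asList(
                    new TableFieldSchema().setName("key").setType("STRING").setMode("REQUIRED"),
                    new TableFieldSchema()
                        .setName("value")
                        .setType("INTEGER")
                        .setMode("NULLABLE")));

    // TableSchema extends GenericJson, so it prints as JSON.
    System.out.println(schema);
  }
}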
Example #1
Source File: ChangelogTableDynamicDestinations.java    From DataflowTemplates with Apache License 2.0
@Override
public TableSchema getSchema(String targetTable) {
  Map<String, KV<Schema, Schema>> schemaMap = this.sideInput(schemaMapView);
  KV<Schema, Schema> keyAndValueSchemas = schemaMap.get(targetTable);

  TableFieldSchema rowSchema = new TableFieldSchema()
      .setName("fullRecord")
      .setType("RECORD")
      .setMode("NULLABLE")   // This field is null for deletions
      .setFields(BigQueryUtils.toTableSchema(keyAndValueSchemas.getValue()).getFields());

  TableFieldSchema pkSchema = new TableFieldSchema()
      .setName("primaryKey")
      .setType("RECORD")
      .setFields(BigQueryUtils.toTableSchema(keyAndValueSchemas.getKey()).getFields());

  TableSchema changelogTableSchema = new TableSchema()
      .setFields(Arrays.asList(
          rowSchema,
          pkSchema,
          new TableFieldSchema().setName("operation").setType("STRING"),
          new TableFieldSchema().setName("timestampMs").setType("INT64"),
          new TableFieldSchema().setName("tableName").setType("STRING")));

  return changelogTableSchema;
}
 
Example #2
Source File: DLPTextToBigQueryStreaming.java    From dlp-dataflow-deidentification with Apache License 2.0
@Override
public TableSchema getSchema(KV<String, TableRow> destination) {
  TableRow bqRow = destination.getValue();
  TableSchema schema = new TableSchema();
  List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
  List<TableCell> cells = bqRow.getF();
  for (int i = 0; i < cells.size(); i++) {
    // TableCell extends GenericJson, so each cell can be read as a Map whose
    // single key is the column header.
    Map<String, Object> object = cells.get(i);
    String header = object.keySet().iterator().next();
    // Currently all BQ data types are set to STRING.
    fields.add(new TableFieldSchema().setName(checkHeaderName(header)).setType("STRING"));
  }

  schema.setFields(fields);
  return schema;
}
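The checkHeaderName helper used above is defined elsewhere in DLPTextToBigQueryStreaming and is not shown on this page. A minimal sketch, assuming its only job is to enforce BigQuery's column-name rules (letters, digits, and underscores, not starting with a digit); the real implementation may differ:

// Hypothetical stand-in for the checkHeaderName helper referenced above.
private static String checkHeaderName(String name) {
  // Replace characters BigQuery does not allow in column names.
  String sanitized = name.replaceAll("[^A-Za-z0-9_]", "_");
  // Column names must not start with a digit.
  if (sanitized.isEmpty() || Character.isDigit(sanitized.charAt(0))) {
    sanitized = "_" + sanitized;
  }
  return sanitized;
}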
 
Example #3
Source File: BigQueryDynamicConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public TableSchema getSchema(KV<TableId, TableRow> destination) {

  TableRow bqRow = destination.getValue();
  TableSchema schema = new TableSchema();
  List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
  List<TableCell> cells = bqRow.getF();
  for (int i = 0; i < cells.size(); i++) {
    // TableCell extends GenericJson, so each cell can be read as a Map whose
    // single key is the column header.
    Map<String, Object> object = cells.get(i);
    String header = object.keySet().iterator().next();
    // Currently all BQ data types are set to STRING.
    // TODO: why do we use checkHeaderName here and not elsewhere? Revisit if we
    // add the sanitized version below back in.
    // fields.add(new TableFieldSchema().setName(checkHeaderName(header)).setType("STRING"));
    fields.add(new TableFieldSchema().setName(header).setType("STRING"));
  }

  schema.setFields(fields);
  return schema;
}
 
Example #4
Source File: OpinionAnalysisPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Setup step {A}
 * Helper method that defines the BigQuery schema used for the output.
 */
private static TableSchema getWebResourceSchema() {
	List<TableFieldSchema> fields = new ArrayList<>();
	fields.add(new TableFieldSchema().setName("WebResourceHash").setType("STRING").setMode("REQUIRED"));
	fields.add(new TableFieldSchema().setName("Url").setType("STRING"));
	fields.add(new TableFieldSchema().setName("PublicationTime").setType("TIMESTAMP").setMode("REQUIRED"));
	fields.add(new TableFieldSchema().setName("PublicationDateId").setType("INTEGER").setMode("REQUIRED"));
	fields.add(new TableFieldSchema().setName("ProcessingTime").setType("TIMESTAMP").setMode("REQUIRED"));
	fields.add(new TableFieldSchema().setName("ProcessingDateId").setType("INTEGER").setMode("REQUIRED"));
	fields.add(new TableFieldSchema().setName("DocumentHash").setType("STRING"));
	fields.add(new TableFieldSchema().setName("DocumentCollectionId").setType("STRING"));
	fields.add(new TableFieldSchema().setName("CollectionItemId").setType("STRING"));
	fields.add(new TableFieldSchema().setName("Title").setType("STRING"));
	fields.add(new TableFieldSchema().setName("Domain").setType("STRING"));
	fields.add(new TableFieldSchema().setName("Author").setType("STRING"));
	fields.add(new TableFieldSchema().setName("ParentWebResourceHash").setType("STRING"));

	TableSchema schema = new TableSchema().setFields(fields);
	return schema;
}
 
Example #5
Source File: BigQueryIOWriteTest.java    From beam with Apache License 2.0
@Test
public void testWriteEmptyPCollection() throws Exception {
  TableSchema schema =
      new TableSchema()
          .setFields(
              ImmutableList.of(new TableFieldSchema().setName("number").setType("INTEGER")));

  p.apply(Create.empty(TableRowJsonCoder.of()))
      .apply(
          BigQueryIO.writeTableRows()
              .to("project-id:dataset-id.table-id")
              .withTestServices(fakeBqServices)
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withSchema(schema)
              .withoutValidation());
  p.run();

  checkNotNull(
      fakeDatasetService.getTable(
          BigQueryHelpers.parseTableSpec("project-id:dataset-id.table-id")));
}
 
Example #6
Source File: BigQueryIOWriteTest.java    From beam with Apache License 2.0
@Test
public void testWriteValidateFailsBothFormatFunctions() {
  p.enableAbandonedNodeEnforcement(false);

  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(
      "Only one of withFormatFunction or withAvroFormatFunction/withAvroWriter maybe set, not both.");
  p.apply(Create.empty(INPUT_RECORD_CODER))
      .apply(
          BigQueryIO.<InputRecord>write()
              .to("dataset.table")
              .withSchema(new TableSchema())
              .withFormatFunction(r -> new TableRow())
              .withAvroFormatFunction(r -> new GenericData.Record(r.getSchema()))
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));
}
 
Example #7
Source File: BigQueryFeatureSink.java    From feast with Apache License 2.0
/** @param featureSetSpecs feature set specs to be written */
@Override
public PCollection<FeatureSetReference> prepareWrite(
    PCollection<KV<FeatureSetReference, FeatureSetProto.FeatureSetSpec>> featureSetSpecs) {
  PCollection<KV<FeatureSetReference, TableSchema>> schemas =
      featureSetSpecs
          .apply(
              "GenerateTableSchema",
              ParDo.of(
                  new FeatureSetSpecToTableSchema(
                      DatasetId.of(getProjectId(), getDatasetId()), getBQClient())))
          .setCoder(
              KvCoder.of(
                  AvroCoder.of(FeatureSetReference.class),
                  FeatureSetSpecToTableSchema.TableSchemaCoder.of()));

  schemasView =
      schemas
          .apply("ReferenceString", ParDo.of(new ReferenceToString()))
          .apply("View", View.asMultimap());

  return schemas.apply("Ready", Keys.create());
}
 
Example #8
Source File: FeatureSetSpecToTableSchema.java    From feast with Apache License 2.0
@ProcessElement
public void processElement(
    @Element KV<FeatureSetReference, FeatureSetProto.FeatureSetSpec> element,
    OutputReceiver<KV<FeatureSetReference, TableSchema>> output,
    ProcessContext context) {
  String specKey = element.getKey().getReference();

  Table existingTable = getExistingTable(specKey);
  Schema schema = createSchemaFromSpec(element.getValue(), specKey, existingTable);

  if (existingTable == null) {
    createTable(specKey, schema);
  }

  output.output(KV.of(element.getKey(), serializeSchema(schema)));
}
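The helpers called here (getExistingTable, createSchemaFromSpec, createTable, serializeSchema) live elsewhere in FeatureSetSpecToTableSchema. As a rough sketch only, serializeSchema presumably maps the client-library com.google.cloud.bigquery.Schema back to the API-model TableSchema along these lines; the actual feast code may differ:

// Hypothetical sketch: convert a com.google.cloud.bigquery.Schema into the
// com.google.api.services.bigquery.model.TableSchema that BigQueryIO expects.
private static TableSchema serializeSchema(com.google.cloud.bigquery.Schema schema) {
  List<TableFieldSchema> fields = new ArrayList<>();
  for (com.google.cloud.bigquery.Field f : schema.getFields()) {
    fields.add(
        new TableFieldSchema()
            .setName(f.getName())
            .setType(f.getType().name())
            .setMode(f.getMode() == null ? null : f.getMode().name()));
  }
  return new TableSchema().setFields(fields);
}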
 
Example #9
Source File: FakeJobService.java    From beam with Apache License 2.0
private JobStatus runExtractJob(Job job, JobConfigurationExtract extract)
    throws InterruptedException, IOException {
  TableReference sourceTable = extract.getSourceTable();

  List<TableRow> rows =
      datasetService.getAllRows(
          sourceTable.getProjectId(), sourceTable.getDatasetId(), sourceTable.getTableId());
  TableSchema schema = datasetService.getTable(sourceTable).getSchema();
  List<Long> destinationFileCounts = Lists.newArrayList();
  for (String destination : extract.getDestinationUris()) {
    destinationFileCounts.add(writeRows(sourceTable.getTableId(), rows, schema, destination));
  }
  job.setStatistics(
      new JobStatistics()
          .setExtract(new JobStatistics4().setDestinationUriFileCounts(destinationFileCounts)));
  return new JobStatus().setState("DONE");
}
 
Example #10
Source File: BigQueryIOIT.java    From beam with Apache License 2.0
private void testWrite(BigQueryIO.Write<byte[]> writeIO, String metricName) {
  Pipeline pipeline = Pipeline.create(options);

  BigQueryIO.Write.Method method = BigQueryIO.Write.Method.valueOf(options.getWriteMethod());
  pipeline
      .apply("Read from source", Read.from(new SyntheticBoundedSource(sourceOptions)))
      .apply("Gather time", ParDo.of(new TimeMonitor<>(NAMESPACE, metricName)))
      .apply("Map records", ParDo.of(new MapKVToV()))
      .apply(
          "Write to BQ",
          writeIO
              .to(tableQualifier)
              .withCustomGcsTempLocation(ValueProvider.StaticValueProvider.of(tempRoot))
              .withMethod(method)
              .withSchema(
                  new TableSchema()
                      .setFields(
                          Collections.singletonList(
                              new TableFieldSchema().setName("data").setType("BYTES")))));

  PipelineResult pipelineResult = pipeline.run();
  pipelineResult.waitUntilFinish();
  extractAndPublishTime(pipelineResult, metricName);
}
 
Example #11
Source File: BigQueryUtilsTest.java    From beam with Apache License 2.0
@Test
public void testToTableSchema_row() {
  TableSchema schema = toTableSchema(ROW_TYPE);

  assertThat(schema.getFields().size(), equalTo(1));
  TableFieldSchema field = schema.getFields().get(0);
  assertThat(field.getName(), equalTo("row"));
  assertThat(field.getType(), equalTo(StandardSQLTypeName.STRUCT.toString()));
  assertThat(field.getMode(), nullValue());
  assertThat(
      field.getFields(),
      containsInAnyOrder(
          ID,
          VALUE,
          NAME,
          TIMESTAMP_VARIANT1,
          TIMESTAMP_VARIANT2,
          TIMESTAMP_VARIANT3,
          TIMESTAMP_VARIANT4,
          VALID,
          BINARY));
}
 
Example #12
Source File: BigQueryLoader.java    From quetzal with Eclipse Public License 2.0
public static void write(String table, TableSchema schema, PCollection<JSONObject> data) {
	data.apply("convert to TableRow", ParDo.of(new DoFn<JSONObject,TableRow>() {
		private static final long serialVersionUID = -4204128594221801617L;
		@SuppressWarnings("unchecked")
		@ProcessElement
		public void processElement(ProcessContext c) {
			JSONObject obj = c.element();
			TableRow x = new TableRow();
			obj.keySet().forEach((Object key) -> {
				x.set((String) key, obj.get(key));
			});
			c.output(x);
		}
	})).apply(BigQueryIO.Write
			.withTableDescription(table)
			.withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE)
			.withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
			.withSchema(schema)
			.to(table));
}
 
Example #13
Source File: BigQueryOutputConfiguration.java    From hadoop-connectors with Apache License 2.0
/**
 * Gets the output table schema based on the given configuration.
 *
 * @param conf the configuration to reference the keys from.
 * @return the derived table schema, or an absent value if no table schema exists in the configuration.
 * @throws IOException if a table schema was set in the configuration but couldn't be parsed.
 */
static Optional<BigQueryTableSchema> getTableSchema(Configuration conf) throws IOException {
  String fieldsJson = OUTPUT_TABLE_SCHEMA.get(conf, conf::get);
  if (!Strings.isNullOrEmpty(fieldsJson)) {
    try {
      TableSchema tableSchema = BigQueryTableHelper.createTableSchemaFromFields(fieldsJson);
      return Optional.of(BigQueryTableSchema.wrap(tableSchema));
    } catch (IOException e) {
      throw new IOException("Unable to parse key '" + OUTPUT_TABLE_SCHEMA.getKey() + "'.", e);
    }
  }
  return Optional.empty();
}
 
Example #14
Source File: BigQueryIOWriteTest.java    From beam with Apache License 2.0
private void testWriteValidatesDataset(boolean unbounded) throws Exception {
  TableReference tableRef = new TableReference();
  tableRef.setDatasetId("somedataset");
  tableRef.setTableId("sometable");

  PCollection<TableRow> tableRows;
  if (unbounded) {
    tableRows =
        p.apply(GenerateSequence.from(0))
            .apply(
                MapElements.via(
                    new SimpleFunction<Long, TableRow>() {
                      @Override
                      public TableRow apply(Long input) {
                        return null;
                      }
                    }))
            .setCoder(TableRowJsonCoder.of());
  } else {
    tableRows = p.apply(Create.empty(TableRowJsonCoder.of()));
  }

  thrown.expect(RuntimeException.class);
  // Message will be one of following depending on the execution environment.
  thrown.expectMessage(
      Matchers.either(Matchers.containsString("Unable to confirm BigQuery dataset presence"))
          .or(Matchers.containsString("BigQuery dataset not found for table")));
  tableRows.apply(
      BigQueryIO.writeTableRows()
          .to(tableRef)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withSchema(new TableSchema())
          .withTestServices(fakeBqServices));
  p.run();
}
 
Example #15
Source File: BigQueryIOWriteTest.java    From beam with Apache License 2.0
@Test
public void testFailuresNoRetryPolicy() throws Exception {
  TableRow row1 = new TableRow().set("name", "a").set("number", "1");
  TableRow row2 = new TableRow().set("name", "b").set("number", "2");
  TableRow row3 = new TableRow().set("name", "c").set("number", "3");

  TableDataInsertAllResponse.InsertErrors ephemeralError =
      new TableDataInsertAllResponse.InsertErrors()
          .setErrors(ImmutableList.of(new ErrorProto().setReason("timeout")));

  fakeDatasetService.failOnInsert(
      ImmutableMap.of(
          row1, ImmutableList.of(ephemeralError, ephemeralError),
          row2, ImmutableList.of(ephemeralError, ephemeralError)));

  p.apply(Create.of(row1, row2, row3))
      .apply(
          BigQueryIO.writeTableRows()
              .to("project-id:dataset-id.table-id")
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
              .withSchema(
                  new TableSchema()
                      .setFields(
                          ImmutableList.of(
                              new TableFieldSchema().setName("name").setType("STRING"),
                              new TableFieldSchema().setName("number").setType("INTEGER"))))
              .withTestServices(fakeBqServices)
              .withoutValidation());
  p.run();

  assertThat(
      fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"),
      containsInAnyOrder(row1, row2, row3));
}
 
Example #16
Source File: ExampleUtils.java    From beam with Apache License 2.0
private void setupBigQueryTable(
    String projectId, String datasetId, String tableId, TableSchema schema) throws IOException {
  if (bigQueryClient == null) {
    bigQueryClient = newBigQueryClient(options.as(BigQueryOptions.class)).build();
  }

  Datasets datasetService = bigQueryClient.datasets();
  if (executeNullIfNotFound(datasetService.get(projectId, datasetId)) == null) {
    Dataset newDataset =
        new Dataset()
            .setDatasetReference(
                new DatasetReference().setProjectId(projectId).setDatasetId(datasetId));
    datasetService.insert(projectId, newDataset).execute();
  }

  Tables tableService = bigQueryClient.tables();
  Table table = executeNullIfNotFound(tableService.get(projectId, datasetId, tableId));
  if (table == null) {
    Table newTable =
        new Table()
            .setSchema(schema)
            .setTableReference(
                new TableReference()
                    .setProjectId(projectId)
                    .setDatasetId(datasetId)
                    .setTableId(tableId));
    tableService.insert(projectId, datasetId, newTable).execute();
  } else if (!table.getSchema().equals(schema)) {
    throw new RuntimeException(
        "Table exists and schemas do not match, expecting: "
            + schema.toPrettyString()
            + ", actual: "
            + table.getSchema().toPrettyString());
  }
}
 
Example #17
Source File: BigQuerySchemaUpdateOptionsIT.java    From beam with Apache License 2.0
/**
 * Runs a write test against a BigQuery table to check that SchemaUpdateOption sets are taking
 * effect.
 *
 * <p>Attempts to write a row via BigQueryIO.writeTableRows with the given params, then runs the
 * given query, and finally checks the results of the query.
 *
 * @param schemaUpdateOptions The SchemaUpdateOption set to use
 * @param tableName The table to write to
 * @param schema The schema to use for the table
 * @param rowToInsert The row to insert
 * @param testQuery A testing SQL query to run after writing the row
 * @param expectedResult The expected result of the query as a nested list of column values (one
 *     list per result row)
 */
private void runWriteTest(
    Set<SchemaUpdateOption> schemaUpdateOptions,
    String tableName,
    TableSchema schema,
    TableRow rowToInsert,
    String testQuery,
    List<List<String>> expectedResult)
    throws Exception {
  Options options = TestPipeline.testingPipelineOptions().as(Options.class);
  options.setTempLocation(options.getTempRoot() + "/bq_it_temp");

  Pipeline p = Pipeline.create(options);
  Create.Values<TableRow> input = Create.<TableRow>of(rowToInsert);

  Write<TableRow> writer =
      BigQueryIO.writeTableRows()
          .to(String.format("%s:%s.%s", options.getProject(), BIG_QUERY_DATASET_ID, tableName))
          .withSchema(schema)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
          .withSchemaUpdateOptions(schemaUpdateOptions);

  p.apply(input).apply(writer);
  p.run().waitUntilFinish();

  QueryResponse response = BQ_CLIENT.queryWithRetries(testQuery, project);

  List<List<String>> result =
      response.getRows().stream()
          .map(
              row ->
                  row.getF().stream()
                      .map(cell -> cell.getV().toString())
                      .collect(Collectors.toList()))
          .collect(Collectors.toList());

  assertEquals(expectedResult, result);
}
 
Example #18
Source File: FilterExamples.java    From beam with Apache License 2.0
/** Helper method to build the table schema for the output table. */
private static TableSchema buildWeatherSchemaProjection() {
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName("year").setType("INTEGER"));
  fields.add(new TableFieldSchema().setName("month").setType("INTEGER"));
  fields.add(new TableFieldSchema().setName("day").setType("INTEGER"));
  fields.add(new TableFieldSchema().setName("mean_temp").setType("FLOAT"));
  return new TableSchema().setFields(fields);
}
 
Example #19
Source File: BigQueryIOStorageReadTest.java    From beam with Apache License 2.0
@Test
public void testTableSourceInitialSplit_EmptyTable() throws Exception {
  fakeDatasetService.createDataset("foo.com:project", "dataset", "", "", null);
  TableReference tableRef = BigQueryHelpers.parseTableSpec("foo.com:project:dataset.table");

  Table table =
      new Table()
          .setTableReference(tableRef)
          .setNumBytes(1024L * 1024L)
          .setSchema(new TableSchema());

  fakeDatasetService.createTable(table);

  CreateReadSessionRequest expectedRequest =
      CreateReadSessionRequest.newBuilder()
          .setParent("projects/project-id")
          .setTableReference(BigQueryHelpers.toTableRefProto(tableRef))
          .setRequestedStreams(1024)
          .setShardingStrategy(ShardingStrategy.BALANCED)
          .build();

  ReadSession emptyReadSession = ReadSession.newBuilder().build();
  StorageClient fakeStorageClient = mock(StorageClient.class);
  when(fakeStorageClient.createReadSession(expectedRequest)).thenReturn(emptyReadSession);

  BigQueryStorageTableSource<TableRow> tableSource =
      BigQueryStorageTableSource.create(
          ValueProvider.StaticValueProvider.of(tableRef),
          null,
          null,
          null,
          new TableRowParser(),
          TableRowJsonCoder.of(),
          new FakeBigQueryServices()
              .withDatasetService(fakeDatasetService)
              .withStorageClient(fakeStorageClient));

  List<? extends BoundedSource<TableRow>> sources = tableSource.split(1024L, options);
  assertTrue(sources.isEmpty());
}
 
Example #20
Source File: BigQueryHllSketchCompatibilityIT.java    From beam with Apache License 2.0
@BeforeClass
public static void prepareDatasetAndDataTables() throws Exception {
  BIGQUERY_CLIENT.createNewDataset(PROJECT_ID, DATASET_ID);

  TableSchema dataTableSchema =
      new TableSchema()
          .setFields(
              Collections.singletonList(
                  new TableFieldSchema().setName(DATA_FIELD_NAME).setType(DATA_FIELD_TYPE)));

  Table dataTableNonEmpty =
      new Table()
          .setSchema(dataTableSchema)
          .setTableReference(
              new TableReference()
                  .setProjectId(PROJECT_ID)
                  .setDatasetId(DATASET_ID)
                  .setTableId(DATA_TABLE_ID_NON_EMPTY));
  BIGQUERY_CLIENT.createNewTable(PROJECT_ID, DATASET_ID, dataTableNonEmpty);
  // Prepopulates dataTableNonEmpty with TEST_DATA
  List<Map<String, Object>> rows =
      TEST_DATA.stream()
          .map(v -> Collections.singletonMap(DATA_FIELD_NAME, (Object) v))
          .collect(Collectors.toList());
  BIGQUERY_CLIENT.insertDataToTable(PROJECT_ID, DATASET_ID, DATA_TABLE_ID_NON_EMPTY, rows);

  Table dataTableEmpty =
      new Table()
          .setSchema(dataTableSchema)
          .setTableReference(
              new TableReference()
                  .setProjectId(PROJECT_ID)
                  .setDatasetId(DATASET_ID)
                  .setTableId(DATA_TABLE_ID_EMPTY));
  BIGQUERY_CLIENT.createNewTable(PROJECT_ID, DATASET_ID, dataTableEmpty);
}
 
Example #21
Source File: BigQueryLoader.java    From quetzal with Eclipse Public License 2.0
public static TableSchema datatypeTable() {
	TableSchema x = new TableSchema();
	x.setFields(
			Arrays.asList(
					field("datatype_name", "STRING"),
					field("datatype_id", "INTEGER")));
	return x;
}
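The field helper is defined elsewhere in BigQueryLoader; given how it is called above, a minimal sketch would be:

// Hypothetical sketch of the field(...) helper used above: builds a
// TableFieldSchema from a column name and a legacy SQL type string.
private static TableFieldSchema field(String name, String type) {
  return new TableFieldSchema().setName(name).setType(type);
}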
 
Example #22
Source File: FakeJobService.java    From beam with Apache License 2.0
private List<TableRow> readAvroTableRows(String filename, TableSchema tableSchema)
    throws IOException {
  List<TableRow> tableRows = Lists.newArrayList();
  FileReader<GenericRecord> dfr =
      DataFileReader.openReader(new File(filename), new GenericDatumReader<>());

  while (dfr.hasNext()) {
    GenericRecord record = dfr.next(null);
    tableRows.add(BigQueryUtils.convertGenericRecordToTableRow(record, tableSchema));
  }
  return tableRows;
}
 
Example #23
Source File: BQETLNested.java    From bigquery-etl-dataflow-sample with Apache License 2.0
private static TableSchema bqSchema() {
  return FieldSchemaListBuilder.create()
      .intField("artist_id")
      .stringField("artist_gid")
      .stringField("artist_name")
      .stringField("artist_sort_name")
      .intField("artist_begin_date_year")
      .intField("artist_begin_date_month")
      .intField("artist_begin_date_day")
      .intField("artist_end_date_year")
      .intField("artist_end_date_month")
      .intField("artist_end_date_day")
      .intField("artist_type")
      .stringField("artist_area")
      .stringField("artist_gender")
      .intField("artist_edits_pending")
      .timestampField("artist_last_updated")
      .stringField("artist_comment")
      .boolField("artist_ended")
      .stringField("artist_begin_area")
      .field(FieldSchemaListBuilder.create()
          .intField("artist_credit_name_artist_credit")
          .intField("artist_credit_name_position")
          .intField("artist_credit_name_artist")
          .stringField("artist_credit_name_name")
          .stringField("artist_credit_name_join_phrase")
          .intField("recording_id")
          .stringField("recording_gid")
          .stringField("recording_name")
          .intField("recording_length")
          .stringField("recording_comment")
          .intField("recording_edits_pending")
          .timestampField("recording_last_updated")
          .boolField("recording_video")
          .repeatedRecord("artist_recordings")).schema();

}
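FieldSchemaListBuilder ships with the bigquery-etl-dataflow-sample project and is not shown on this page. A minimal sketch of the fluent API this example assumes, with names and behavior inferred from the call sites (the real class may differ):

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableSchema;
import java.util.ArrayList;
import java.util.List;

// Hypothetical sketch of the schema builder used in bqSchema() above.
class FieldSchemaListBuilder {
  private final List<TableFieldSchema> fields = new ArrayList<>();

  static FieldSchemaListBuilder create() {
    return new FieldSchemaListBuilder();
  }

  FieldSchemaListBuilder field(String name, String type) {
    fields.add(new TableFieldSchema().setName(name).setType(type));
    return this;
  }

  // Accepts a prebuilt field, e.g. the RECORD produced by repeatedRecord below.
  FieldSchemaListBuilder field(TableFieldSchema fieldSchema) {
    fields.add(fieldSchema);
    return this;
  }

  FieldSchemaListBuilder intField(String name) { return field(name, "INTEGER"); }
  FieldSchemaListBuilder stringField(String name) { return field(name, "STRING"); }
  FieldSchemaListBuilder boolField(String name) { return field(name, "BOOLEAN"); }
  FieldSchemaListBuilder timestampField(String name) { return field(name, "TIMESTAMP"); }

  // Wraps the fields accumulated so far in a REPEATED RECORD field.
  TableFieldSchema repeatedRecord(String name) {
    return new TableFieldSchema()
        .setName(name)
        .setType("RECORD")
        .setMode("REPEATED")
        .setFields(fields);
  }

  TableSchema schema() {
    return new TableSchema().setFields(fields);
  }
}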
 
Example #24
Source File: BigQueryRowCountIT.java    From beam with Apache License 2.0
@Test
public void testNonEmptyTable() {
  BigQueryTableProvider provider = new BigQueryTableProvider();
  Table table = getTable("testTable", bigQuery.tableSpec());

  pipeline
      .apply(
          Create.of(
                  new TableRow().set("id", 1).set("name", "name1"),
                  new TableRow().set("id", 2).set("name", "name2"),
                  new TableRow().set("id", 3).set("name", "name3"))
              .withCoder(TableRowJsonCoder.of()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(bigQuery.tableSpec())
              .withSchema(
                  new TableSchema()
                      .setFields(
                          ImmutableList.of(
                              new TableFieldSchema().setName("id").setType("INTEGER"),
                              new TableFieldSchema().setName("name").setType("STRING"))))
              .withoutValidation());
  pipeline.run().waitUntilFinish();

  BeamSqlTable sqlTable = provider.buildBeamSqlTable(table);
  BeamTableStatistics size1 = sqlTable.getTableStatistics(TestPipeline.testingPipelineOptions());

  assertNotNull(size1);
  assertEquals(3d, size1.getRowCount(), 0.1);
}
 
Example #25
Source File: BigQuerySchemaUpdateOptionsIT.java    From beam with Apache License 2.0
@Test
public void testAllowFieldAddition() throws Exception {
  String tableName = makeTestTable();

  Set<SchemaUpdateOption> schemaUpdateOptions =
      EnumSet.of(BigQueryIO.Write.SchemaUpdateOption.ALLOW_FIELD_ADDITION);

  TableSchema newSchema =
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName("new_field").setType("STRING"),
                  new TableFieldSchema().setName("optional_field").setType("STRING"),
                  new TableFieldSchema()
                      .setName("required_field")
                      .setType("STRING")
                      .setMode("REQUIRED")));

  String[] values = {"meow", "bark"};
  TableRow rowToInsert =
      new TableRow().set("new_field", values[0]).set("required_field", values[1]);

  String testQuery =
      String.format(
          "SELECT new_field, required_field FROM [%s.%s];", BIG_QUERY_DATASET_ID, tableName);

  List<List<String>> expectedResult = Arrays.asList(Arrays.asList(values));
  runWriteTest(schemaUpdateOptions, tableName, newSchema, rowToInsert, testQuery, expectedResult);
}
 
Example #26
Source File: BigQueryTableHelper.java    From hadoop-connectors with Apache License 2.0
/**
 * Creates {@link TableSchema} from the JSON representation of the table fields.
 *
 * @param fieldsJson JSON fields to convert to {@link TableSchema}
 * @return {@link TableSchema}
 * @throws IOException if the fields JSON cannot be parsed
 */
static TableSchema createTableSchemaFromFields(String fieldsJson) throws IOException {
  List<TableFieldSchema> fields = new ArrayList<>();
  JsonParser parser = JacksonFactory.getDefaultInstance().createJsonParser(fieldsJson);
  parser.parseArrayAndClose(fields, TableFieldSchema.class);

  return new TableSchema().setFields(fields);
}
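For reference, the fieldsJson argument is the JSON-array form of a BigQuery schema's fields list, as produced for example by bq show --schema. A small usage sketch:

// Sketch: parse a fields array into a TableSchema.
// (createTableSchemaFromFields throws IOException if the JSON cannot be parsed.)
String fieldsJson =
    "[{\"name\": \"word\", \"type\": \"STRING\"},"
        + " {\"name\": \"count\", \"type\": \"INTEGER\", \"mode\": \"NULLABLE\"}]";
TableSchema schema = BigQueryTableHelper.createTableSchemaFromFields(fieldsJson);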
 
Example #27
Source File: BigQueryRowCountIT.java    From beam with Apache License 2.0
/** Tests that pipeline options are injected in the path of the SQL transform. */
@Test
public void testPipelineOptionInjection() {
  BigQueryTestTableProvider provider = new BigQueryTestTableProvider();
  Table table = getTable("testTable", bigQuery.tableSpec());
  provider.addTable("testTable", table);

  pipeline
      .apply(
          Create.of(
                  new TableRow().set("id", 1).set("name", "name1"),
                  new TableRow().set("id", 2).set("name", "name2"),
                  new TableRow().set("id", 3).set("name", "name3"))
              .withCoder(TableRowJsonCoder.of()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(bigQuery.tableSpec())
              .withSchema(
                  new TableSchema()
                      .setFields(
                          ImmutableList.of(
                              new TableFieldSchema().setName("id").setType("INTEGER"),
                              new TableFieldSchema().setName("name").setType("STRING"))))
              .withoutValidation());
  pipeline.run().waitUntilFinish();

  // changing pipeline options
  readingPipeline.getOptions().setJobName(FAKE_JOB_NAME);

  // Reading from the table should update the statistics of the BigQuery table.
  readingPipeline.apply(
      SqlTransform.query(" select * from testTable ")
          .withDefaultTableProvider("bigquery", provider));

  readingPipeline.run().waitUntilFinish();

  BigQueryTestTable sqlTable = (BigQueryTestTable) provider.buildBeamSqlTable(table);
  assertEquals(FAKE_JOB_NAME, sqlTable.getJobName());
}
 
Example #28
Source File: BigQuerySourceBase.java    From beam with Apache License 2.0
protected ExtractResult extractFiles(PipelineOptions options) throws Exception {
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  TableReference tableToExtract = getTableToExtract(bqOptions);
  BigQueryServices.DatasetService datasetService = bqServices.getDatasetService(bqOptions);
  Table table = datasetService.getTable(tableToExtract);
  if (table == null) {
    throw new IOException(
        String.format(
            "Cannot start an export job since table %s does not exist",
            BigQueryHelpers.toTableSpec(tableToExtract)));
  }

  TableSchema schema = table.getSchema();
  JobService jobService = bqServices.getJobService(bqOptions);
  String extractJobId = getExtractJobId(createJobIdToken(options.getJobName(), stepUuid));
  final String extractDestinationDir =
      resolveTempLocation(bqOptions.getTempLocation(), "BigQueryExtractTemp", stepUuid);
  String bqLocation =
      BigQueryHelpers.getDatasetLocation(
          datasetService, tableToExtract.getProjectId(), tableToExtract.getDatasetId());
  List<ResourceId> tempFiles =
      executeExtract(
          extractJobId,
          tableToExtract,
          jobService,
          bqOptions.getProject(),
          extractDestinationDir,
          bqLocation);
  return new ExtractResult(schema, tempFiles);
}
 
Example #29
Source File: RowWriterFactory.java    From beam with Apache License 2.0
private AvroRowWriterFactory(
    SerializableFunction<AvroWriteRequest<ElementT>, AvroT> toAvro,
    SerializableFunction<Schema, DatumWriter<AvroT>> writerFactory,
    SerializableFunction<TableSchema, Schema> schemaFactory,
    DynamicDestinations<?, DestinationT> dynamicDestinations) {
  this.toAvro = toAvro;
  this.writerFactory = writerFactory;
  this.schemaFactory = schemaFactory;
  this.dynamicDestinations = dynamicDestinations;
}
 
Example #30
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link BigQueryConverters.AvroToEntity} creates an Entity with a valid key when the
 * unique name column is a string.
 */
@Test
public void testAvroToEntityStringIdColumn() throws Exception {
  // Create test data
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName(idField).setType("STRING"));
  fields.add(new TableFieldSchema().setName(shortStringField).setType("STRING"));
  TableSchema bqSchema = new TableSchema().setFields(fields);
  Schema avroSchema =
      new Schema.Parser()
          .parse(
              String.format(
                  avroSchemaTemplate,
                  new StringBuilder()
                      .append(String.format(avroFieldTemplate, idField, "string", idFieldDesc))
                      .append(",")
                      .append(generateShortStringField())
                      .toString()));
  GenericRecordBuilder builder = new GenericRecordBuilder(avroSchema);
  builder.set(idField, idFieldValueStr);
  builder.set(shortStringField, shortStringFieldValue);
  Record record = builder.build();
  SchemaAndRecord inputBqData = new SchemaAndRecord(record, bqSchema);
  // Run the test
  Entity outputEntity = converter.apply(inputBqData);
  assertTrue(outputEntity.hasKey());
  assertEquals(idFieldValueStr, outputEntity.getKey().getPath(0).getName());
  validateMetadata(outputEntity);
}