com.google.api.services.bigquery.model.TableFieldSchema Java Examples

The following examples show how to use com.google.api.services.bigquery.model.TableFieldSchema. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BigQueryUtils.java    From beam with Apache License 2.0 7 votes vote down vote up
/**
 * Converts a list of BigQuery field schemas into a {@link Schema}, one field per entry.
 *
 * <p>A REPEATED field is wrapped in an array type. A field is nullable when its mode is absent
 * or NULLABLE. Non-empty descriptions are carried over to the resulting field.
 */
private static Schema fromTableFieldSchema(List<TableFieldSchema> tableFieldSchemas) {
  Schema.Builder builder = Schema.builder();
  for (TableFieldSchema bqField : tableFieldSchemas) {
    FieldType type = fromTableFieldSchemaType(bqField.getType(), bqField.getFields());

    // A missing mode stays null; any other value must name a Mode constant.
    String rawMode = bqField.getMode();
    Mode mode = (rawMode == null) ? null : Mode.valueOf(rawMode);
    if (mode == Mode.REPEATED) {
      type = FieldType.array(type);
    }

    // No mode at all is treated the same way as an explicit NULLABLE.
    boolean isNullable = (mode == null) || (mode == Mode.NULLABLE);
    Field converted = Field.of(bqField.getName(), type).withNullable(isNullable);

    String description = bqField.getDescription();
    if (description != null && !description.isEmpty()) {
      converted = converted.withDescription(description);
    }
    builder.addField(converted);
  }
  return builder.build();
}
 
Example #2
Source File: ChangelogTableDynamicDestinations.java    From DataflowTemplates with Apache License 2.0 7 votes vote down vote up
/**
 * Builds the changelog table schema for {@code targetTable}: the full record, the primary key,
 * and the operation / timestamp / table-name bookkeeping columns.
 */
@Override
public TableSchema getSchema(String targetTable) {
  Map<String, KV<Schema, Schema>> schemasByTable = this.sideInput(schemaMapView);
  KV<Schema, Schema> keyAndValue = schemasByTable.get(targetTable);

  // Nested record holding every column of the changed row; null for deletions.
  TableFieldSchema fullRecord = new TableFieldSchema();
  fullRecord.setName("fullRecord");
  fullRecord.setType("RECORD");
  fullRecord.setMode("NULLABLE");
  fullRecord.setFields(BigQueryUtils.toTableSchema(keyAndValue.getValue()).getFields());

  // Nested record holding the key columns.
  TableFieldSchema primaryKey = new TableFieldSchema();
  primaryKey.setName("primaryKey");
  primaryKey.setType("RECORD");
  primaryKey.setFields(BigQueryUtils.toTableSchema(keyAndValue.getKey()).getFields());

  TableFieldSchema operation = new TableFieldSchema().setName("operation").setType("STRING");
  TableFieldSchema timestampMs = new TableFieldSchema().setName("timestampMs").setType("INT64");
  TableFieldSchema tableName = new TableFieldSchema().setName("tableName").setType("STRING");

  return new TableSchema()
      .setFields(Arrays.asList(fullRecord, primaryKey, operation, timestampMs, tableName));
}
 
Example #3
Source File: BigQueryAvroUtils.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Copies the values of {@code record} into a new {@link TableRow}, converting each cell
 * according to the matching entry of {@code fields}. Cells that convert to null are left out.
 */
private static TableRow convertGenericRecordToTableRow(
    GenericRecord record, List<TableFieldSchema> fields) {
  TableRow result = new TableRow();
  for (TableFieldSchema bqField : fields) {
    // The BigQuery schema reference marks the name as required
    // (https://cloud.google.com/bigquery/docs/reference/v2/tables#schema), so it is not
    // null-checked here.
    Field avroField = record.getSchema().getField(bqField.getName());
    String columnName = avroField.name();
    Object cell = getTypedCellValue(avroField.schema(), bqField, record.get(columnName));
    if (cell == null) {
      // BigQuery's own JSON exports omit null values; do the same.
      continue;
    }
    result.set(columnName, cell);
  }
  return result;
}
 
Example #4
Source File: BigQueryDynamicConverters.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
@Override
public TableSchema getSchema(KV<TableId, TableRow> destination) {

  TableRow bqRow = destination.getValue();
  TableSchema schema = new TableSchema();
  List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
  List<TableCell> cells = bqRow.getF();
  for (int i = 0; i < cells.size(); i++) {
    Map<String, Object> object = cells.get(i);
    String header = object.keySet().iterator().next();
    /** currently all BQ data types are set to String */
    // Why do we use checkHeaderName here and not elsewhere, TODO if we add this back in
    // fields.add(new TableFieldSchema().setName(checkHeaderName(header)).setType("STRING"));
    fields.add(new TableFieldSchema().setName(header).setType("STRING"));
  }

  schema.setFields(fields);
  return schema;
}
 
Example #5
Source File: BigQueryDynamicConverters.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
@Override
public TableSchema getSchema(KV<TableId, TableRow> destination) {

  TableRow bqRow = destination.getValue();
  TableSchema schema = new TableSchema();
  List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
  List<TableCell> cells = bqRow.getF();
  for (int i = 0; i < cells.size(); i++) {
    Map<String, Object> object = cells.get(i);
    String header = object.keySet().iterator().next();
    /** currently all BQ data types are set to String */
    // Why do we use checkHeaderName here and not elsewhere, TODO if we add this back in
    // fields.add(new TableFieldSchema().setName(checkHeaderName(header)).setType("STRING"));
    fields.add(new TableFieldSchema().setName(header).setType("STRING"));
  }

  schema.setFields(fields);
  return schema;
}
 
Example #6
Source File: BigQueryIOWriteTest.java    From beam with Apache License 2.0 6 votes vote down vote up
@Test
public void testWriteEmptyPCollection() throws Exception {
  TableSchema schema =
      new TableSchema()
          .setFields(
              ImmutableList.of(new TableFieldSchema().setName("number").setType("INTEGER")));

  p.apply(Create.empty(TableRowJsonCoder.of()))
      .apply(
          BigQueryIO.writeTableRows()
              .to("project-id:dataset-id.table-id")
              .withTestServices(fakeBqServices)
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withSchema(schema)
              .withoutValidation());
  p.run();

  checkNotNull(
      fakeDatasetService.getTable(
          BigQueryHelpers.parseTableSpec("project-id:dataset-id.table-id")));
}
 
Example #7
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Tests that BigQueryConverters.validateKeyColumn() throws IllegalArgumentException when the
 * BigQuery column is NULL.
 */
@Test
public void testValidateKeyColumnNull() {
  TableFieldSchema column = new TableFieldSchema().setName(nullField).setType("STRING");
  Record record = generateSingleFieldAvroRecord(nullField, "null", nullFieldDesc, null);
  boolean isThrown = false;
  String message = null;
  try {
    BigQueryConverters.validateKeyColumn(column, record.get(nullField));
  } catch (IllegalArgumentException e) {
    isThrown = true;
    message = e.getMessage();
  }
  assertTrue(isThrown);
  assertTrue(message != null);
  // assertEquals takes (expected, actual); keeping that order makes a failure report label
  // the two strings correctly.
  assertEquals(
      String.format("Column [%s] with NULL value cannot be set as Entity name.", nullField),
      message);
}
 
Example #8
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Tests that BigQueryConverters.validateKeyColumn() throws IllegalArgumentException when the
 * BigQuery column is a STRING exceeding BigQueryConverters.MAX_STRING_SIZE_BYTES bytes.
 */
@Test
public void testValidateKeyColumnStringLong() {
  TableFieldSchema column = new TableFieldSchema().setName(longStringField).setType("STRING");
  Record record =
      generateSingleFieldAvroRecord(
          longStringField, "string", longStringFieldDesc, longStringFieldValue);
  boolean isThrown = false;
  String message = null;
  try {
    BigQueryConverters.validateKeyColumn(column, record.get(longStringField));
  } catch (IllegalArgumentException e) {
    isThrown = true;
    message = e.getMessage();
  }
  assertTrue(isThrown);
  assertTrue(message != null);
  // assertEquals takes (expected, actual); keeping that order makes a failure report label
  // the two strings correctly.
  assertEquals(
      String.format(
          "Column [%s] exceeding %d bytes cannot be set as Entity name.",
          longStringField, BigQueryConverters.MAX_STRING_SIZE_BYTES),
      message);
}
 
Example #9
Source File: BigQueryUtilsTest.java    From hadoop-connectors with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that BigQueryUtils.getSchemaFromString parses a simple two-field schema, keeping the
 * name, type and (when given) mode of each field.
 */
@Test
public void testGetSchemaFromString() {
  // Schema under test: one plain field and one field that also carries a mode.
  String fields =
      "[{'name': 'MyName', 'type': 'STRING'},"
      + "{'name': 'Number', 'type': 'INTEGER', 'mode': 'sample'}]";

  List<TableFieldSchema> parsed = BigQueryUtils.getSchemaFromString(fields);
  assertThat(parsed).hasSize(2);

  TableFieldSchema nameField = parsed.get(0);
  assertThat(nameField.getName()).isEqualTo("MyName");
  assertThat(nameField.getType()).isEqualTo("STRING");

  TableFieldSchema numberField = parsed.get(1);
  assertThat(numberField.getName()).isEqualTo("Number");
  assertThat(numberField.getType()).isEqualTo("INTEGER");
  assertThat(numberField.getMode()).isEqualTo("sample");
}
 
Example #10
Source File: CheckedBigqueryTest.java    From nomulus with Apache License 2.0 6 votes vote down vote up
/** Wires the Bigquery mocks together and registers one single-column schema per test table. */
@Before
public void before() throws Exception {
  // Dataset and table insert calls return the prepared insert mocks.
  when(bigquery.datasets()).thenReturn(bigqueryDatasets);
  when(bigqueryDatasets.insert(eq("Project-Id"), any(Dataset.class)))
      .thenReturn(bigqueryDatasetsInsert);
  when(bigquery.tables()).thenReturn(bigqueryTables);
  when(bigqueryTables.insert(eq("Project-Id"), any(String.class), any(Table.class)))
      .thenReturn(bigqueryTablesInsert);

  checkedBigquery = new CheckedBigquery();
  checkedBigquery.bigquery = bigquery;

  ImmutableMap.Builder<String, ImmutableList<TableFieldSchema>> schemas =
      new ImmutableMap.Builder<String, ImmutableList<TableFieldSchema>>();
  schemas.put(
      "Table-Id",
      ImmutableList.of(new TableFieldSchema().setName("column1").setType(STRING.name())));
  schemas.put(
      "Table2",
      ImmutableList.of(new TableFieldSchema().setName("column1").setType(STRING.name())));
  checkedBigquery.bigquerySchemas = schemas.build();
}
 
Example #11
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Tests that BigQueryConverters.columnToValue() throws IllegalArgumentException when the BigQuery
 * column is an invalid TIMESTAMP.
 */
@Test
public void testColumnToValueTimestampInvalid() {
  TableFieldSchema column =
      new TableFieldSchema().setName(invalidTimestampField).setType("TIMESTAMP");
  Record record =
      generateSingleFieldAvroRecord(
          invalidTimestampField,
          "long",
          invalidTimestampFieldDesc,
          invalidTimestampFieldValueNanos);
  boolean isThrown = false;
  try {
    // Only the exception matters here, so the returned value is deliberately discarded.
    BigQueryConverters.columnToValue(column, record.get(invalidTimestampField));
  } catch (IllegalArgumentException e) {
    isThrown = true;
  }
  assertTrue(isThrown);
}
 
Example #12
Source File: CheckedBigquery.java    From nomulus with Apache License 2.0 6 votes vote down vote up
/**
 * Ensures the table exists in Bigquery.
 *
 * <p>Attempts to create the table with the given schema. A failure whose reason is
 * {@code "duplicate"} is ignored; any other failure is rethrown.
 *
 * @param bigquery client used to issue the insert
 * @param table reference (project, dataset, table id) of the table to create
 * @param schema field schemas of the new table
 * @throws IOException if the insert fails for any reason other than {@code "duplicate"}
 */
private void ensureTable(Bigquery bigquery, TableReference table, List<TableFieldSchema> schema)
    throws IOException {
  try {
    bigquery.tables().insert(table.getProjectId(), table.getDatasetId(), new Table()
        .setSchema(new TableSchema().setFields(schema))
        .setTableReference(table))
        .execute();
    logger.atInfo().log(
        "Created BigQuery table %s:%s.%s",
        table.getProjectId(), table.getDatasetId(), table.getTableId());
  } catch (IOException e) {
    // Swallow errors about a table that exists, and throw any other ones. The constant-first
    // comparison keeps a missing reason from raising a NullPointerException that would hide
    // the original IOException.
    if (!"duplicate".equals(BigqueryJobFailureException.create(e).getReason())) {
      throw e;
    }
  }
}
 
Example #13
Source File: BigQueryInterpreter.java    From zeppelin with Apache License 2.0 6 votes vote down vote up
/**
 * Renders a query result as text: one line with the column names followed by one line per row,
 * with values joined by {@code TAB} and lines terminated by {@code NEWLINE}.
 *
 * @param response query result to render
 * @return the rendered table
 * @throws NullPointerException if any part of the response needed for rendering is null; the
 *     original exception is attached as the cause
 */
public static String printRows(final GetQueryResultsResponse response) {
  StringBuilder msg = new StringBuilder();
  try {
    List<String> schemNames = new ArrayList<String>();
    for (TableFieldSchema schem: response.getSchema().getFields()) {
      schemNames.add(schem.getName());
    }
    msg.append(Joiner.on(TAB).join(schemNames));
    msg.append(NEWLINE);
    for (TableRow row : response.getRows()) {
      List<String> fieldValues = new ArrayList<String>();
      for (TableCell field : row.getF()) {
        fieldValues.add(field.getV().toString());
      }
      msg.append(Joiner.on(TAB).join(fieldValues));
      msg.append(NEWLINE);
    }
    return msg.toString();
  } catch (NullPointerException ex) {
    // Keep the exception type and message callers may rely on, but chain the original so
    // its stack trace still shows which part of the response was missing.
    NullPointerException wrapped = new NullPointerException("SQL Execution returned an error!");
    wrapped.initCause(ex);
    throw wrapped;
  }
}
 
Example #14
Source File: BigQueryServicesImplTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/** Verifies that tryCreateTable returns null, without failing, when the table already exists. */
@Test
public void testCreateTableSucceedsAlreadyExists() throws IOException {
  TableReference ref = new TableReference();
  ref.setProjectId("project");
  ref.setDatasetId("dataset");
  ref.setTableId("table");

  TableFieldSchema firstColumn = new TableFieldSchema().setName("column1").setType("String");
  TableFieldSchema secondColumn = new TableFieldSchema().setName("column2").setType("Integer");
  TableSchema schema = new TableSchema().setFields(ImmutableList.of(firstColumn, secondColumn));
  Table testTable = new Table().setTableReference(ref).setSchema(schema);

  // HTTP 409 is the "already exists" status.
  when(response.getStatusCode()).thenReturn(409);

  BigQueryServicesImpl.DatasetServiceImpl services =
      new BigQueryServicesImpl.DatasetServiceImpl(bigquery, PipelineOptionsFactory.create());
  Table created =
      services.tryCreateTable(
          testTable, new RetryBoundedBackOff(0, BackOff.ZERO_BACKOFF), Sleeper.DEFAULT);

  assertNull(created);
  verify(response, times(1)).getStatusCode();
  verify(response, times(1)).getContent();
  verify(response, times(1)).getContentType();
}
 
Example #15
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0 6 votes vote down vote up
/**
 * Tests that {@link BigQueryConverters.AvroToEntity} produces an Entity without a key when the
 * input contains a field of type Record, and that the Entity carries the cause and the row.
 */
@Test
public void testAvroToEntityRecordField() throws Exception {
  // Arrange: a schema with a single nested column and a matching nested record.
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(generateNestedTableFieldSchema());
  TableSchema bqSchema = new TableSchema().setFields(fields);
  Record record = generateNestedAvroRecord();

  // Act
  Entity outputEntity = converter.apply(new SchemaAndRecord(record, bqSchema));

  // Assert
  assertTrue(!outputEntity.hasKey());
  assertEquals(
      "Column [address] of type [RECORD] not supported.",
      outputEntity.getPropertiesMap().get("cause").getStringValue());
  assertEquals(record.toString(), outputEntity.getPropertiesMap().get("row").getStringValue());
}
 
Example #16
Source File: OpinionAnalysisPipeline.java    From dataflow-opinion-analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Setup step {A}
 * Builds the BigQuery schema of the web resource output table.
 */
private static TableSchema getWebResourceSchema() {
	List<TableFieldSchema> columns = new ArrayList<>();
	columns.add(new TableFieldSchema().setName("WebResourceHash").setType("STRING").setMode("REQUIRED"));
	columns.add(new TableFieldSchema().setName("Url").setType("STRING"));

	// REQUIRED time columns, listed as {name, type}.
	String[][] requiredColumns = {
		{"PublicationTime", "TIMESTAMP"},
		{"PublicationDateId", "INTEGER"},
		{"ProcessingTime", "TIMESTAMP"},
		{"ProcessingDateId", "INTEGER"},
	};
	for (String[] spec : requiredColumns) {
		columns.add(new TableFieldSchema().setName(spec[0]).setType(spec[1]).setMode("REQUIRED"));
	}

	// The remaining columns are STRINGs without an explicit mode.
	String[] stringColumns = {
		"DocumentHash",
		"DocumentCollectionId",
		"CollectionItemId",
		"Title",
		"Domain",
		"Author",
		"ParentWebResourceHash",
	};
	for (String columnName : stringColumns) {
		columns.add(new TableFieldSchema().setName(columnName).setType("STRING"));
	}

	return new TableSchema().setFields(columns);
}
 
Example #17
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Setup step {A}
 * Builds the BigQuery schema of the web resource output table, including the repeated
 * MetaFields column.
 */
private static TableSchema getWebResourceSchema() {
	List<TableFieldSchema> columns = new ArrayList<>();
	columns.add(new TableFieldSchema().setName("WebResourceHash").setType("STRING").setMode("REQUIRED"));
	columns.add(new TableFieldSchema().setName("Url").setType("STRING"));

	// REQUIRED time columns: names and types are kept in parallel arrays.
	String[] requiredNames = {"PublicationTime", "PublicationDateId", "ProcessingTime", "ProcessingDateId"};
	String[] requiredTypes = {"TIMESTAMP", "INTEGER", "TIMESTAMP", "INTEGER"};
	for (int i = 0; i < requiredNames.length; i++) {
		columns.add(new TableFieldSchema().setName(requiredNames[i]).setType(requiredTypes[i]).setMode("REQUIRED"));
	}

	// STRING columns without an explicit mode.
	String[] stringNames = {
		"DocumentHash",
		"DocumentCollectionId",
		"CollectionItemId",
		"Title",
		"Domain",
		"Author",
		"ParentWebResourceHash",
	};
	for (String stringName : stringNames) {
		columns.add(new TableFieldSchema().setName(stringName).setType("STRING"));
	}

	columns.add(new TableFieldSchema().setName("MetaFields").setType("STRING").setMode("REPEATED"));

	return new TableSchema().setFields(columns);
}
 
Example #18
Source File: IcannReportingStager.java    From nomulus with Apache License 2.0 6 votes vote down vote up
/**
 * Adds the numeric cells of {@code row} to the running {@code totals}, position by position.
 * The first three cells of the row are skipped; the rest are parsed as integers.
 */
private void addToTotal(List<Integer> totals, Map<TableFieldSchema, Object> row) {
  // Skip the TLD, registrar name and IANA id columns, which are not counters.
  List<Integer> increments =
      row.values().stream()
          .skip(3)
          .map(Object::toString)
          .map(Integer::parseInt)
          .collect(toImmutableList());
  checkState(
      increments.size() == totals.size(),
      "Number of elements in totals not equal to number of elements in row!");
  for (int idx = 0; idx < increments.size(); idx++) {
    int updated = totals.get(idx) + increments.get(idx);
    totals.set(idx, updated);
  }
}
 
Example #19
Source File: IcannReportingStager.java    From nomulus with Apache License 2.0 6 votes vote down vote up
/**
 * Writes one single-row activity report per input row to GCS and returns what each
 * {@code saveReportToGcs} call returned, in row order.
 */
private ImmutableList<String> stageActivityReports(
    YearMonth yearMonth,
    String subdir,
    String headerRow,
    ImmutableCollection<Map<TableFieldSchema, Object>> rows)
    throws IOException {
  ImmutableList.Builder<String> manifest = ImmutableList.builder();
  for (Map<TableFieldSchema, Object> row : rows) {
    // The first cell of every row holds the TLD the report is for.
    Object firstCell = row.values().iterator().next();
    String tld = firstCell.toString();
    if (isNullOrEmpty(tld)) {
      throw new RuntimeException("Found an empty row in the activity report table!");
    }
    // Each report consists of the header plus exactly this one row.
    ImmutableList<String> reportRows = ImmutableList.of(constructRow(row.values()));
    String report = createReport(headerRow, reportRows);
    manifest.add(saveReportToGcs(tld, yearMonth, subdir, report, ReportType.ACTIVITY));
  }
  return manifest.build();
}
 
Example #20
Source File: BigQuerySchemaUpdateOptionsIT.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a write with ALLOW_FIELD_RELAXATION against a freshly made test table, using a schema
 * whose only column has no explicit mode, and checks the inserted value can be queried back.
 */
@Test
public void testAllowFieldRelaxation() throws Exception {
  String tableName = makeTestTable();
  final String value = "hellooo";

  Set<SchemaUpdateOption> updateOptions =
      EnumSet.of(BigQueryIO.Write.SchemaUpdateOption.ALLOW_FIELD_RELAXATION);

  TableFieldSchema relaxedField =
      new TableFieldSchema().setName("optional_field").setType("STRING");
  TableSchema relaxedSchema = new TableSchema().setFields(ImmutableList.of(relaxedField));

  TableRow row = new TableRow().set("optional_field", value);
  String query =
      String.format("SELECT optional_field FROM [%s.%s];", BIG_QUERY_DATASET_ID, tableName);
  List<List<String>> expected = Arrays.asList(Arrays.asList(value));

  runWriteTest(updateOptions, tableName, relaxedSchema, row, query, expected);
}
 
Example #21
Source File: TemplatePipeline.java    From gcp-batch-ingestion-bigquery with Apache License 2.0 5 votes vote down vote up
/** Builds the output table schema: seven columns, each with a name and a type only. */
private static TableSchema getTableSchema() {
    // Columns in table order, as {name, type}.
    String[][] columns = {
        {"year", "INTEGER"},
        {"month", "INTEGER"},
        {"day", "INTEGER"},
        {"wikimedia_project", "STRING"},
        {"language", "STRING"},
        {"title", "STRING"},
        {"views", "INTEGER"},
    };
    List<TableFieldSchema> fields = new ArrayList<>();
    for (String[] column : columns) {
        fields.add(new TableFieldSchema().setName(column[0]).setType(column[1]));
    }
    return new TableSchema().setFields(fields);
}
 
Example #22
Source File: BigQueryAvroRegistry.java    From components with Apache License 2.0 5 votes vote down vote up
/**
 * Sets the BigQuery type of {@code fieldSchema} from {@code avroSchema}; for an Avro RECORD it
 * also converts every child field and attaches the results as nested fields.
 */
private TableFieldSchema tryFieldSchema(TableFieldSchema fieldSchema, org.apache.avro.Schema avroSchema) {
    TableFieldSchema typed = fieldSchema.setType(getBQFieldType(avroSchema));

    // Only records have children to convert.
    if (avroSchema.getType() != org.apache.avro.Schema.Type.RECORD) {
        return typed;
    }

    List<TableFieldSchema> nested = new ArrayList<>();
    for (org.apache.avro.Schema.Field child : avroSchema.getFields()) {
        nested.add(tryArrayFieldSchema(child));
    }
    typed.setFields(nested);
    return typed;
}
 
Example #23
Source File: BigqueryConnection.java    From nomulus with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code querySql} against the default dataset as a job and returns its results as a local
 * table. This is the synchronous counterpart of queryToLocalTable: it avoids spawning new
 * background threads, which App Engine doesn't support.
 *
 * @see <a href="https://cloud.google.com/appengine/docs/standard/java/runtime#Threads">App Engine
 *     Runtime</a>
 */
public ImmutableTable<Integer, TableFieldSchema, Object> queryToLocalTableSync(String querySql) {
  JobConfigurationQuery queryConfig = new JobConfigurationQuery();
  queryConfig.setQuery(querySql);
  queryConfig.setDefaultDataset(getDataset());

  JobConfiguration jobConfig = new JobConfiguration().setQuery(queryConfig);
  Job queryJob = new Job().setConfiguration(jobConfig);

  return getQueryResults(runJob(queryJob));
}
 
Example #24
Source File: FeatureSetSpecToTableSchema.java    From feast with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a table schema into its json-like {@link TableSchema} form (prepared for
 * serialization). Name and type are always copied; mode and description only when present.
 *
 * @param schema bq table schema
 * @return json-like schema
 */
private TableSchema serializeSchema(Schema schema) {
  List<TableFieldSchema> converted =
      schema.getFields().stream()
          .map(
              source -> {
                TableFieldSchema target = new TableFieldSchema();
                target.setName(source.getName());
                target.setType(source.getType().name());
                // Optional attributes are copied only when the source carries them.
                if (source.getMode() != null) {
                  target.setMode(source.getMode().name());
                }
                if (source.getDescription() != null) {
                  target.setDescription(source.getDescription());
                }
                return target;
              })
          .collect(Collectors.toList());

  return new TableSchema().setFields(converted);
}
 
Example #25
Source File: TemplatePipeline.java    From gcp-batch-ingestion-bigquery with Apache License 2.0 5 votes vote down vote up
/**
 * Turns one comma-separated input line into a {@link TableRow}, pairing the i-th value with the
 * name of the i-th schema column, and emits it.
 *
 * <p>The header line (compared case-insensitively) and lines with more values than the schema
 * has columns are dropped without output.
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    String line = c.element();
    if (line.equalsIgnoreCase(HEADER)) {
        return;
    }
    // Build the schema once per element rather than once per column, and take the column
    // limit from it so the check cannot drift from the schema definition.
    List<TableFieldSchema> columns = getTableSchema().getFields();
    String[] split = line.split(",");
    if (split.length > columns.size()) {
        return;
    }
    TableRow row = new TableRow();
    for (int i = 0; i < split.length; i++) {
        row.set(columns.get(i).getName(), split[i]);
    }
    c.output(row);
}
 
Example #26
Source File: Json.java    From gcp-ingestion with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Read a {@link Schema} from a byte array holding a UTF-8 JSON array of field definitions.
 *
 * <p>{@link Schema} does not natively support Jackson deserialization, so the fields are first
 * parsed into a {@link TableSchema} and then converted reflectively, a roundabout method
 * inspired by https://github.com/googleapis/google-cloud-java/issues/2753.
 *
 * @exception IOException if {@code data} does not contain a valid {@link Schema}.
 */
public static Schema readBigQuerySchema(byte[] data) throws IOException {
  String json = new String(data, Charsets.UTF_8);
  List<TableFieldSchema> parsedFields = (List<TableFieldSchema>) JSON_FACTORY //
      .createJsonParser(json) //
      .parseArray(ArrayList.class, TableFieldSchema.class);

  TableSchema tableSchema = new TableSchema();
  tableSchema.setFields(parsedFields);

  try {
    // SCHEMA_FROM_PB is invoked statically, hence the null receiver.
    Object converted = SCHEMA_FROM_PB.invoke(null, tableSchema);
    return (Schema) converted;
  } catch (IllegalAccessException | InvocationTargetException e) {
    throw new RuntimeException(e);
  }
}
 
Example #27
Source File: OpinionAnalysisPipeline.java    From dataflow-opinion-analysis with Apache License 2.0 5 votes vote down vote up
/**
 * Setup step {A}
 * Builds the BigQuery schema of the document output table, including the repeated Tags record.
 */
private static TableSchema getDocumentTableSchema() {
	List<TableFieldSchema> columns = new ArrayList<>();

	// REQUIRED columns, listed as {name, type}.
	String[][] requiredColumns = {
		{"DocumentHash", "STRING"},
		{"PublicationTime", "TIMESTAMP"},
		{"PublicationDateId", "INTEGER"},
		{"ProcessingTime", "TIMESTAMP"},
		{"ProcessingDateId", "INTEGER"},
	};
	for (String[] spec : requiredColumns) {
		columns.add(new TableFieldSchema().setName(spec[0]).setType(spec[1]).setMode("REQUIRED"));
	}

	// Columns without an explicit mode, listed as {name, type}.
	String[][] plainColumns = {
		{"DocumentCollectionId", "STRING"},
		{"CollectionItemId", "STRING"},
		{"Title", "STRING"},
		{"Type", "INTEGER"},
		{"Language", "STRING"},
		{"ParseDepth", "INTEGER"},
		{"ContentLength", "INTEGER"},
		{"Author", "STRING"},
		{"Text", "STRING"},
		{"MainWebResourceHash", "STRING"},
		{"ParentWebResourceHash", "STRING"},
	};
	for (String[] spec : plainColumns) {
		columns.add(new TableFieldSchema().setName(spec[0]).setType(spec[1]));
	}

	// Tags is a repeated record with three sub-fields.
	List<TableFieldSchema> tagColumns = new ArrayList<>();
	tagColumns.add(new TableFieldSchema().setName("Tag").setType("STRING"));
	tagColumns.add(new TableFieldSchema().setName("Weight").setType("FLOAT"));
	tagColumns.add(new TableFieldSchema().setName("GoodAsTopic").setType("BOOLEAN"));
	columns.add(new TableFieldSchema().setName("Tags").setType("RECORD").setFields(tagColumns).setMode("REPEATED"));

	return new TableSchema().setFields(columns);
}
 
Example #28
Source File: FieldSchemaListBuilder.java    From bigquery-etl-dataflow-sample with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps the fields collected by {@code list} in a new RECORD-typed TableFieldSchema and returns
 * it, so that it may be added to another FieldSchemaListBuilder. No name is set on the result.
 */
public TableFieldSchema fieldSchema(FieldSchemaListBuilder list) {
  return new TableFieldSchema().setType("RECORD").setFields(list.schemaFields);
}
 
Example #29
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Tests that {@link BigQueryConverters.AvroToEntity} falls back to the default (empty)
 * namespace when the converter is built without one.
 */
@Test
public void testAvroToEntityDefaultNamespace() throws Exception {
  // Arrange: BigQuery schema with an id column and a short string column.
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName(idField).setType("STRING"));
  fields.add(new TableFieldSchema().setName(shortStringField).setType("STRING"));
  TableSchema bqSchema = new TableSchema().setFields(fields);

  // Matching Avro schema, assembled from the field templates.
  String idFieldJson = String.format(avroFieldTemplate, idField, "int", idFieldDesc);
  String avroFieldsJson = idFieldJson + "," + generateShortStringField();
  Schema avroSchema = new Schema.Parser().parse(String.format(avroSchemaTemplate, avroFieldsJson));

  Record record =
      new GenericRecordBuilder(avroSchema)
          .set(idField, 1)
          .set(shortStringField, shortStringFieldValue)
          .build();
  SchemaAndRecord inputBqData = new SchemaAndRecord(record, bqSchema);

  // Act: the converter is built without a namespace on purpose.
  AvroToEntity noNamespaceConverter =
      AvroToEntity.newBuilder()
          .setEntityKind(entityKind)
          .setUniqueNameColumn(uniqueNameColumn)
          .build();
  Entity outputEntity = noNamespaceConverter.apply(inputBqData);

  // Assert
  assertTrue(outputEntity.hasKey());
  assertEquals("", outputEntity.getKey().getPartitionId().getNamespaceId());
}
 
Example #30
Source File: MusicBrainzTransforms.java    From bigquery-etl-dataflow-sample with Apache License 2.0 5 votes vote down vote up
/**
 * Flattens {@code fields} into a map from field name to field type, recursing into RECORD
 * fields so that a record maps to a nested map of its own fields.
 *
 * @param current map to fill, or null to start from a new HashMap
 * @param fields field schemas to add
 * @return {@code current} when it was non-null, otherwise the newly created map
 */
private static Map<String, Object> serializeableTableSchema(Map<String, Object> current, List<TableFieldSchema> fields) {
  Map<String, Object> target = (current != null) ? current : new HashMap<String, Object>();
  for (TableFieldSchema field : fields) {
    boolean isRecord = field.getType().equals(FieldSchemaListBuilder.RECORD);
    // Records become a nested map built from scratch; every other field maps to its type.
    Object value = isRecord ? serializeableTableSchema(null, field.getFields()) : field.getType();
    target.put(field.getName(), value);
  }
  return target;
}