Java Code Examples for org.apache.beam.sdk.options.PipelineOptions#setJobName()

The following examples show how to use org.apache.beam.sdk.options.PipelineOptions#setJobName(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: JobNameConfiguration.java    From dbeam with Apache License 2.0 6 votes vote down vote up
/**
 * Sets the application name to {@code JdbcAvroJob} and, when no explicit job name was
 * requested, generates one of the form {@code dbeam-<parts>-<random hex>}.
 *
 * <p>A job name counts as "not requested" when it is {@code null} or the literal
 * {@code "auto"}; any other value is left untouched. A failure while setting the application
 * name is logged as a warning and does not stop the job name from being configured.
 *
 * @param options pipeline options that are updated in place
 * @param parts name fragments; null or empty fragments are skipped, the remaining ones are
 *     passed through {@code normalizeString} and joined with {@code -}
 */
public static void configureJobName(final PipelineOptions options, final String... parts) {
  try {
    final ApplicationNameOptions appNameOptions = options.as(ApplicationNameOptions.class);
    appNameOptions.setAppName("JdbcAvroJob");
  } catch (Exception e) {
    LOGGER.warn("Unable to configure ApplicationName", e);
  }

  final boolean explicitJobName =
      options.getJobName() != null && !"auto".equals(options.getJobName());
  if (explicitJobName) {
    return;
  }

  // The random suffix keeps generated names distinct across runs with the same parts.
  final String randomSuffix = Integer.toHexString(ThreadLocalRandom.current().nextInt());
  final String joinedParts =
      Arrays.stream(parts)
          .filter(part -> !Strings.isNullOrEmpty(part))
          .map(JobNameConfiguration::normalizeString)
          .collect(Collectors.joining("-"));
  options.setJobName(String.format("dbeam-%s-%s", joinedParts, randomSuffix));
}
 
Example 2
Source File: PipelineOptionsTranslationTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Supplies the option instances this parameterized test is run against: default options,
 * options holding a value set through {@code setUnserializable}, options with a custom field,
 * options configured through setters, and options parsed from a command-line style argument.
 *
 * @return the option instances, in the order listed above
 */
@Parameters(name = "{index}: {0}")
public static Iterable<? extends PipelineOptions> options() {
  PipelineOptionsFactory.register(TestUnserializableOptions.class);
  PipelineOptionsFactory.register(TestDefaultOptions.class);
  PipelineOptionsFactory.register(TestOptions.class);

  PipelineOptions defaults = PipelineOptionsFactory.create();

  TestUnserializableOptions unserializable =
      PipelineOptionsFactory.as(TestUnserializableOptions.class);
  unserializable.setUnserializable(new Object());

  TestOptions customField = PipelineOptionsFactory.as(TestOptions.class);
  customField.setExample(99);

  PipelineOptions viaSetters = PipelineOptionsFactory.create();
  viaSetters.as(ApplicationNameOptions.class).setAppName("my_app");
  viaSetters.setJobName("my_job");

  // NOTE(review): both flags are handed to fromArgs as ONE argument string rather than as two
  // separate arguments - confirm this is the intended input for the parsed case.
  PipelineOptions viaArgs =
      PipelineOptionsFactory.fromArgs("--jobName=my_job --appName=my_app").create();

  return ImmutableList.of(defaults, unserializable, customField, viaSetters, viaArgs);
}
 
Example 3
Source File: WordCountTimeOut1Sec.java    From incubator-nemo with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the word-count pipeline, runs it, waits for at most one second and then cancels it.
 *
 * @param args {@code args[0]} is the input file path, {@code args[1]} the output file path.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];

  final PipelineOptions options = NemoPipelineOptionsFactory.create();
  options.setJobName("WordCountTimeOut1Sec");

  final PipelineResult result =
    generateWordCountPipeline(options, inputFilePath, outputFilePath).run();

  // The states returned by waitUntilFinish and cancel are not inspected.
  result.waitUntilFinish(org.joda.time.Duration.standardSeconds(1));
  try {
    result.cancel();
  } catch (final IOException e) {
    LOG.info("IOException while cancelling job");
  }
}
 
Example 4
Source File: MapReduce.java    From nemo with Apache License 2.0 5 votes vote down vote up
/**
 * Entry point of the MapReduce Beam example: sums the counts per document key and writes
 * one {@code key: total} line per key.
 *
 * @param args {@code args[0]} is the input file path, {@code args[1]} the output file path.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];

  final PipelineOptions options = PipelineOptionsFactory.create().as(NemoPipelineOptions.class);
  options.setRunner(NemoPipelineRunner.class);
  options.setJobName("MapReduce");
  final Pipeline p = Pipeline.create(options);

  // Turns a line into (first token + "#" + second token, third token parsed as a long).
  final SimpleFunction<String, KV<String, Long>> parseLine =
      new SimpleFunction<String, KV<String, Long>>() {
        @Override
        public KV<String, Long> apply(final String line) {
          final String[] tokens = line.split(" +");
          return KV.of(tokens[0] + "#" + tokens[1], Long.parseLong(tokens[2]));
        }
      };

  // Renders an aggregated pair as "key: value".
  final SimpleFunction<KV<String, Long>, String> formatEntry =
      new SimpleFunction<KV<String, Long>, String>() {
        @Override
        public String apply(final KV<String, Long> entry) {
          return entry.getKey() + ": " + entry.getValue();
        }
      };

  final PCollection<String> result = GenericSourceSink.read(p, inputFilePath)
      .apply(MapElements.<String, KV<String, Long>>via(parseLine))
      .apply(GroupByKey.<String, Long>create())
      .apply(Combine.<String, Long, Long>groupedValues(Sum.ofLongs()))
      .apply(MapElements.<KV<String, Long>, String>via(formatEntry));

  GenericSourceSink.write(result, outputFilePath);
  p.run();
}
 
Example 5
Source File: JobNameConfigurationTest.java    From dbeam with Apache License 2.0 5 votes vote down vote up
/**
 * With a null job name and a null second part, the generated job name must consist of exactly
 * three dash-separated segments starting with {@code dbeam-somedb-}.
 */
@Test
public void shouldConfigureJobNameWithEmptyTableName() {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setJobName(null);

  JobNameConfiguration.configureJobName(options, "some_db", null);

  String appName = options.as(ApplicationNameOptions.class).getAppName();
  Assert.assertEquals("JdbcAvroJob", appName);

  String jobName = options.getJobName();
  assertThat(jobName, startsWith("dbeam-somedb-"));
  Assert.assertEquals(3, jobName.split("-").length);
}
 
Example 6
Source File: JobNameConfigurationTest.java    From dbeam with Apache License 2.0 5 votes vote down vote up
/**
 * A job name of {@code "auto"} must be replaced by a generated name that starts with
 * {@code dbeam-somedb-sometable-}.
 */
@Test
public void shouldConfigureJobNameWhenJobNameIsAuto() {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setJobName("auto");

  JobNameConfiguration.configureJobName(options, "some_db", "some_table");

  String appName = options.as(ApplicationNameOptions.class).getAppName();
  Assert.assertEquals("JdbcAvroJob", appName);

  String jobName = options.getJobName();
  assertThat(jobName, startsWith("dbeam-somedb-sometable-"));
}
 
Example 7
Source File: JobNameConfigurationTest.java    From dbeam with Apache License 2.0 5 votes vote down vote up
/**
 * A null job name must be replaced by a generated name that starts with
 * {@code dbeam-somedb-sometable-}, and the application name must be {@code JdbcAvroJob}.
 */
@Test
public void shouldConfigureJobName() {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setJobName(null);

  JobNameConfiguration.configureJobName(options, "some_db", "some_table");

  String appName = options.as(ApplicationNameOptions.class).getAppName();
  Assert.assertEquals("JdbcAvroJob", appName);

  String jobName = options.getJobName();
  assertThat(jobName, startsWith("dbeam-somedb-sometable-"));
}
 
Example 8
Source File: WindowedBroadcast.java    From incubator-nemo with Apache License 2.0 5 votes vote down vote up
/**
 * Entry point of the windowed-broadcast example: every windowed element is checked against a
 * list side input built from the same windowed collection, and the outcome is written out.
 *
 * @param args {@code args[0]} is the output file path.
 */
public static void main(final String[] args) {
  final String outputFilePath = args[0];

  // Two-second sliding windows that start every second.
  final SlidingWindows slidingWindows =
    SlidingWindows.of(Duration.standardSeconds(2)).every(Duration.standardSeconds(1));
  final Window<Long> windowFn = Window.<Long>into(slidingWindows);

  final PipelineOptions options = NemoPipelineOptionsFactory.create();
  options.setJobName("WindowedBroadcast");

  final Pipeline p = Pipeline.create(options);

  final PCollection<Long> windowedElements = getSource(p).apply(windowFn);
  final PCollectionView<List<Long>> windowedView = windowedElements.apply(View.asList());

  windowedElements.apply(ParDo.of(new DoFn<Long, String>() {
      @ProcessElement
      public void processElement(final ProcessContext c) {
        final Long current = c.element();
        final List<Long> wholeWindow = c.sideInput(windowedView);
        System.out.println(current + " / " + wholeWindow);
        if (wholeWindow.contains(current)) {
          c.output(current + " is in " + wholeWindow);
        } else {
          // The element must be part of the side input built from its own window.
          throw new RuntimeException(current + " not in " + wholeWindow.toString());
        }
      }
    }).withSideInputs(windowedView)
  ).apply(new WriteOneFilePerWindow(outputFilePath, 1));

  p.run().waitUntilFinish();
}
 
Example 9
Source File: WindowedWordCount.java    From incubator-nemo with Apache License 2.0 5 votes vote down vote up
/**
 * Entry point of the windowed word-count example: sums the values per key inside each window
 * and writes {@code key: value} lines per window.
 *
 * @param args {@code args[0]} is the output file path; {@code args[1]} selects the windowing:
 *             {@code "fixed"} for 5-second fixed windows, anything else for 10-second windows
 *             sliding every 5 seconds. The whole array is also handed to {@code getSource}.
 */
public static void main(final String[] args) {
  final String outputFilePath = args[0];
  final String windowType = args[1];

  final Window<KV<String, Long>> windowFn = windowType.equals("fixed")
    ? Window.<KV<String, Long>>into(FixedWindows.of(Duration.standardSeconds(5)))
    : Window.<KV<String, Long>>into(SlidingWindows.of(Duration.standardSeconds(10))
      .every(Duration.standardSeconds(5)));

  final PipelineOptions options = NemoPipelineOptionsFactory.create();
  options.setJobName("WindowedWordCount");

  final Pipeline p = Pipeline.create(options);

  // Renders an aggregated pair as "key: value".
  final SimpleFunction<KV<String, Long>, String> formatEntry =
    new SimpleFunction<KV<String, Long>, String>() {
      @Override
      public String apply(final KV<String, Long> entry) {
        return entry.getKey() + ": " + entry.getValue();
      }
    };

  getSource(p, args)
    .apply(windowFn)
    .apply(Sum.longsPerKey())
    .apply(MapElements.<KV<String, Long>, String>via(formatEntry))
    .apply(new WriteOneFilePerWindow(outputFilePath, 1));

  p.run().waitUntilFinish();
}
 
Example 10
Source File: WordCount.java    From incubator-nemo with Apache License 2.0 5 votes vote down vote up
/**
 * Entry point of the word-count example: builds the pipeline and blocks until it finishes.
 *
 * @param args {@code args[0]} is the input file path, {@code args[1]} the output file path.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];

  final PipelineOptions options = NemoPipelineOptionsFactory.create();
  options.setJobName("WordCount");

  generateWordCountPipeline(options, inputFilePath, outputFilePath)
    .run()
    .waitUntilFinish();
}
 
Example 11
Source File: PipelineTestBase.java    From kettle-beam with Apache License 2.0 5 votes vote down vote up
/**
 * Converts the given transformation into a Beam pipeline, runs it to completion with a
 * Direct-runner job configuration and prints every counter metric to standard output.
 *
 * @param transMeta the transformation to convert and run; its name becomes the job name
 * @throws Exception if conversion or execution fails
 */
@Ignore
public void createRunPipeline( TransMeta transMeta ) throws Exception {

  // Debugging aid: transMeta.getXML() can be dumped to "/tmp/" + transMeta.getName() + ".ktr"
  // here when the generated transformation needs to be inspected.

  PipelineOptions options = PipelineOptionsFactory.create();
  options.setJobName( transMeta.getName() );
  options.setUserAgent( BeamConst.STRING_KETTLE_BEAM );

  BeamJobConfig directRunnerConfig = new BeamJobConfig();
  directRunnerConfig.setName("Direct runner test");
  directRunnerConfig.setRunnerTypeName( RunnerType.Direct.name() );

  // The third argument is null: there are no extra plugins to load.
  TransMetaPipelineConverter converter =
    new TransMetaPipelineConverter( transMeta, metaStore, (String) null, directRunnerConfig );

  PipelineResult run = converter.createPipeline( options ).run();
  run.waitUntilFinish();

  MetricQueryResults queried = run.metrics().queryMetrics( MetricsFilter.builder().build() );
  for ( MetricResult<Long> counter : queried.getCounters() ) {
    System.out.println( "Name: " + counter.getName() + " Attempted: " + counter.getAttempted() );
  }
}
 
Example 12
Source File: KettleBeamPipelineExecutor.java    From kettle-beam with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the job name, user agent and temp location onto the pipeline options; each value is
 * only applied when its source string is not empty.
 *
 * @param config job configuration providing the user agent and temp location
 * @param transformationName name used for the job name after removing every character that
 *                           is not an ASCII letter, a digit or {@code -}
 * @param pipelineOptions options that are updated in place
 * @param space variable space used to substitute variables in the configured values
 */
private void configureStandardOptions( BeamJobConfig config, String transformationName, PipelineOptions pipelineOptions, VariableSpace space ) {
  if ( StringUtils.isNotEmpty( transformationName ) ) {
    pipelineOptions.setJobName( transformationName.replaceAll( "[^-A-Za-z0-9]", "" ) );
  }

  final String configuredUserAgent = config.getUserAgent();
  if ( StringUtils.isNotEmpty( configuredUserAgent ) ) {
    pipelineOptions.setUserAgent( space.environmentSubstitute( configuredUserAgent ) );
  }

  final String configuredTempLocation = config.getTempLocation();
  if ( StringUtils.isNotEmpty( configuredTempLocation ) ) {
    pipelineOptions.setTempLocation( space.environmentSubstitute( configuredTempLocation ) );
  }
}
 
Example 13
Source File: PipelineTestBase.java    From hop with Apache License 2.0 5 votes vote down vote up
/**
 * Converts the given pipeline metadata into a Beam pipeline using a direct run configuration,
 * runs it to completion and prints every counter metric to standard output.
 *
 * @param pipelineMeta the pipeline metadata to convert and run
 * @throws Exception if conversion or execution fails
 */
@Ignore
public void createRunPipeline( PipelineMeta pipelineMeta ) throws Exception {

  // Debugging aid: pipelineMeta.getXML() can be dumped to
  // "/tmp/" + pipelineMeta.getName() + ".ktr" here when the metadata needs to be inspected.

  PipelineOptions options = PipelineOptionsFactory.create();
  options.setJobName( pipelineMeta.getName() );
  options.setUserAgent( BeamConst.STRING_HOP_BEAM );

  BeamDirectPipelineRunConfiguration directRunConfig = new BeamDirectPipelineRunConfiguration();
  directRunConfig.setTempLocation( System.getProperty( "java.io.tmpdir" ) );

  HopPipelineMetaToBeamPipelineConverter converter =
    new HopPipelineMetaToBeamPipelineConverter( pipelineMeta, metadataProvider, directRunConfig );

  PipelineResult run = converter.createPipeline().run();
  run.waitUntilFinish();

  MetricQueryResults queried = run.metrics().queryMetrics( MetricsFilter.builder().build() );
  for ( MetricResult<Long> counter : queried.getCounters() ) {
    System.out.println( "Name: " + counter.getName() + " Attempted: " + counter.getAttempted() );
  }
}
 
Example 14
Source File: PartitionWordsByLength.java    From incubator-nemo with Apache License 2.0 4 votes vote down vote up
/**
 * Entry point of the partition-by-length example: splits the input into words and routes each
 * word to one of four outputs depending on its length, then writes each output to its own path.
 *
 * <p>Words shorter than 6 characters go to the "short" output and words of 6 to 10 characters
 * to the "long" output, both keyed by length. Words longer than 12 characters go to the
 * "very very long" output; the remaining words (11 or 12 characters) go to the main
 * "very long" output.
 *
 * @param args {@code args[0]} is the input file path, {@code args[1]} the output path prefix.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];
  final PipelineOptions options = NemoPipelineOptionsFactory.create();
  options.setJobName("PartitionWordsByLength");

  // The empty anonymous subclass bodies are required for preserving type information.
  // Please see https://stackoverflow.com/a/48431397 for details.
  final TupleTag<KV<Integer, String>> shortWordsTag = new TupleTag<KV<Integer, String>>("short") {
  };
  final TupleTag<KV<Integer, String>> longWordsTag = new TupleTag<KV<Integer, String>>("long") {
  };
  final TupleTag<String> veryLongWordsTag = new TupleTag<String>("very long") {
  };
  final TupleTag<String> veryVeryLongWordsTag = new TupleTag<String>("very very long") {
  };

  final Pipeline p = Pipeline.create(options);
  final PCollection<String> lines = GenericSourceSink.read(p, inputFilePath);

  // veryLongWordsTag is the main output; the other three tags are additional outputs.
  final TupleTagList additionalTags =
    TupleTagList.of(shortWordsTag).and(longWordsTag).and(veryVeryLongWordsTag);

  final PCollectionTuple results = lines
    .apply(FlatMapElements
      .into(TypeDescriptors.strings())
      .via(line -> Arrays.asList(line.split(" "))))
    .apply(ParDo.of(new DoFn<String, String>() {
      @ProcessElement
      public void processElement(final ProcessContext c) {
        final String word = c.element();
        final int length = word.length();
        if (length < 6) {
          c.output(shortWordsTag, KV.of(length, word));
        } else if (length < 11) {
          c.output(longWordsTag, KV.of(length, word));
        } else if (length > 12) {
          c.output(veryVeryLongWordsTag, word);
        } else {
          c.output(word);
        }
      }
    }).withOutputTags(veryLongWordsTag, additionalTags));

  final PCollection<String> shortWords = results.get(shortWordsTag)
    .apply(GroupByKey.create())
    .apply(MapElements.via(new FormatLines()));
  final PCollection<String> longWords = results.get(longWordsTag)
    .apply(GroupByKey.create())
    .apply(MapElements.via(new FormatLines()));
  final PCollection<String> veryLongWords = results.get(veryLongWordsTag);
  final PCollection<String> veryVeryLongWords = results.get(veryVeryLongWordsTag);

  GenericSourceSink.write(shortWords, outputFilePath + "_short");
  GenericSourceSink.write(longWords, outputFilePath + "_long");
  GenericSourceSink.write(veryLongWords, outputFilePath + "_very_long");
  GenericSourceSink.write(veryVeryLongWords, outputFilePath + "_very_very_long");
  p.run().waitUntilFinish();
}
 
Example 15
Source File: AlternatingLeastSquareInefficient.java    From incubator-nemo with Apache License 2.0 4 votes vote down vote up
/**
 * Main function for the ALS BEAM program.
 *
 * @param args {@code args[0]}: input file path; {@code args[1]}: number of features;
 *             {@code args[2]}: number of iterations; {@code args[3]} (optional): lambda,
 *             which defaults to 0.05 when absent.
 */
public static void main(final String[] args) {
  final long start = System.currentTimeMillis();
  LOG.info(Arrays.toString(args));
  final String inputFilePath = args[0];
  final Integer numFeatures = Integer.parseInt(args[1]);
  final Integer numItr = Integer.parseInt(args[2]);
  // args[3] exists as soon as more than three arguments are given. The previous guard
  // (length > 4) silently ignored a lambda passed as the fourth argument.
  final Double lambda;
  if (args.length > 3) {
    lambda = Double.parseDouble(args[3]);
  } else {
    lambda = 0.05;
  }

  final PipelineOptions options = NemoPipelineOptionsFactory.create();
  options.setJobName("ALS");
  options.setStableUniqueNames(PipelineOptions.CheckEnabled.OFF);

  final Pipeline p = Pipeline.create(options);
  p.getCoderRegistry().registerCoderProvider(CoderProviders.fromStaticMethods(int[].class, IntArrayCoder.class));
  p.getCoderRegistry().registerCoderProvider(CoderProviders.fromStaticMethods(float[].class, FloatArrayCoder.class));

  // Read raw data
  final PCollection<String> rawData = GenericSourceSink.read(p, inputFilePath);

  // Parse data for item
  final PCollection<KV<Integer, KV<int[], float[]>>> parsedItemData = rawData
    .apply(ParDo.of(new AlternatingLeastSquare.ParseLine(false)))
    .apply(Combine.perKey(new AlternatingLeastSquare.TrainingDataCombiner()));

  // Create Initial Item Matrix
  PCollection<KV<Integer, float[]>> itemMatrix = parsedItemData
    .apply(ParDo.of(new DoFn<KV<Integer, KV<int[], float[]>>, KV<Integer, float[]>>() {
      @ProcessElement
      public void processElement(final ProcessContext c) throws Exception {
        final float[] result = new float[numFeatures];

        final KV<Integer, KV<int[], float[]>> element = c.element();
        final float[] ratings = element.getValue().getValue();
        // Slot 0 holds the mean rating; the remaining slots get small random values.
        for (int i = 0; i < ratings.length; i++) {
          result[0] += ratings[i];
        }

        result[0] /= ratings.length;
        for (int i = 1; i < result.length; i++) {
          result[i] = (float) (Math.random() * 0.01);
        }
        c.output(KV.of(element.getKey(), result));
      }
    }));

  // Iterations to update Item Matrix.
  for (int i = 0; i < numItr; i++) {
    // NOTE: a single composite transform for the iteration.
    itemMatrix = itemMatrix.apply(new UpdateUserAndItemMatrix(numFeatures, lambda, rawData, parsedItemData));
  }

  p.run().waitUntilFinish();
  LOG.info("JCT " + (System.currentTimeMillis() - start));
}
 
Example 16
Source File: WriteRowMetricsDoFnTest.java    From feast with Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code WriteRowMetricsDoFn} over the recorded test input and checks that every expected
 * StatsD line was received by the test StatsD server.
 *
 * @throws IOException declared by the test; presumably raised while reading the fixtures
 * @throws InterruptedException if the wait for the StatsD server is interrupted
 */
@Test
public void shouldSendCorrectStatsDMetrics() throws IOException, InterruptedException {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setJobName("job");
  Map<String, Iterable<FeatureRow>> input =
      readTestInput("feast/ingestion/transform/WriteRowMetricsDoFnTest.input");
  List<String> expectedLines =
      readTestOutput("feast/ingestion/transform/WriteRowMetricsDoFnTest.output");

  // A fixed clock is handed to the DoFn via setClock.
  Clock fixedClock = Clock.fixed(Instant.ofEpochSecond(1585548645), ZoneId.of("UTC"));

  pipeline
      .apply(Create.of(input))
      .apply(
          ParDo.of(
              WriteRowMetricsDoFn.newBuilder()
                  .setStatsdHost("localhost")
                  .setStatsdPort(STATSD_SERVER_PORT)
                  .setStoreName("store")
                  .setClock(fixedClock)
                  .setMetricsNamespace("test")
                  .build()));
  pipeline.run(options).waitUntilFinish();
  // Wait until StatsD has finished processing all messages; 3 sec is a reasonable duration
  // based on empirical testing.
  Thread.sleep(3000);

  List<String> receivedLines = statsDServer.messagesReceived();
  for (String expected : expectedLines) {
    boolean found = receivedLines.stream().anyMatch(received -> received.equals(expected));
    if (found) {
      continue;
    }
    System.out.println("Print actual metrics output for debugging:");
    for (String received : receivedLines) {
      System.out.println(received);
    }
    fail(String.format("Expected StatsD metric not found:\n%s", expected));
  }
  statsDServer.stop();
}
 
Example 17
Source File: WriteFeatureValueMetricsDoFnTest.java    From feast with Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code WriteFeatureValueMetricsDoFn} over the recorded test input and checks that every
 * expected StatsD line was received by the test StatsD server.
 *
 * @throws IOException declared by the test; presumably raised while reading the fixtures
 * @throws InterruptedException if the wait for the StatsD server is interrupted
 */
@Test
public void shouldSendCorrectStatsDMetrics() throws IOException, InterruptedException {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setJobName("job");

  Map<String, Iterable<FeatureRow>> input =
      readTestInput("feast/ingestion/transform/WriteFeatureValueMetricsDoFnTest.input");
  List<String> expectedLines =
      readTestOutput("feast/ingestion/transform/WriteFeatureValueMetricsDoFnTest.output");

  pipeline
      .apply(Create.of(input))
      .apply(
          ParDo.of(
              WriteFeatureValueMetricsDoFn.newBuilder()
                  .setStatsdHost("localhost")
                  .setStatsdPort(STATSD_SERVER_PORT)
                  .setStoreName("store")
                  .setMetricsNamespace("test")
                  .build()));
  pipeline.run(options).waitUntilFinish();
  // Wait until StatsD has finished processing all messages; 3 sec is a reasonable duration
  // based on empirical testing.
  Thread.sleep(3000);

  List<String> receivedLines = statsDServer.messagesReceived();
  for (String expected : expectedLines) {
    boolean found = receivedLines.stream().anyMatch(received -> received.equals(expected));
    if (found) {
      continue;
    }
    System.out.println("Print actual metrics output for debugging:");
    for (String received : receivedLines) {
      System.out.println(received);
    }
    fail(String.format("Expected StatsD metric not found:\n%s", expected));
  }
  statsDServer.stop();
}
 
Example 18
Source File: AlternatingLeastSquare.java    From incubator-nemo with Apache License 2.0 4 votes vote down vote up
/**
 * Entry point of the ALS Beam example: parses the ratings per item and per user, creates an
 * initial item matrix, applies the update transform for the requested number of iterations and
 * optionally writes the resulting item matrix as comma-separated lines.
 *
 * @param args {@code args[0]}: input file path; {@code args[1]}: number of features;
 *             {@code args[2]}: number of iterations; {@code args[3]} (optional): lambda,
 *             default 0.05; {@code args[4]} (optional): output file path - when present the
 *             result is written there.
 */
public static void main(final String[] args) {
  final Long start = System.currentTimeMillis();
  LOG.info(Arrays.toString(args));

  final String inputFilePath = args[0];
  final Integer numFeatures = Integer.parseInt(args[1]);
  final Integer numItr = Integer.parseInt(args[2]);
  final Double lambda = args.length > 3 ? Double.parseDouble(args[3]) : 0.05;

  // Output is only produced (and checked) when an output path was supplied.
  final boolean checkOutput = args.length > 4;
  final String outputFilePath = checkOutput ? args[4] : "";

  final PipelineOptions options = NemoPipelineOptionsFactory.create();
  options.setJobName("ALS");
  options.setStableUniqueNames(PipelineOptions.CheckEnabled.OFF);

  final Pipeline p = Pipeline.create(options);
  p.getCoderRegistry().registerCoderProvider(CoderProviders.fromStaticMethods(int[].class, IntArrayCoder.class));
  p.getCoderRegistry().registerCoderProvider(CoderProviders.fromStaticMethods(float[].class, FloatArrayCoder.class));

  // Raw input lines.
  final PCollection<String> rawData = GenericSourceSink.read(p, inputFilePath);

  // Training data parsed for items.
  final PCollection<KV<Integer, KV<int[], float[]>>> parsedItemData = rawData
    .apply(ParDo.of(new ParseLine(false)))
    .apply(Combine.perKey(new TrainingDataCombiner()));

  // Training data parsed for users.
  final PCollection<KV<Integer, KV<int[], float[]>>> parsedUserData = rawData
    .apply(ParDo.of(new ParseLine(true)))
    .apply(Combine.perKey(new TrainingDataCombiner()));

  // Initial item matrix.
  PCollection<KV<Integer, float[]>> itemMatrix =
    parsedItemData.apply(ParDo.of(new CreateInitialMatrix(numFeatures, checkOutput)));

  // Each iteration applies a single composite transform that updates the item matrix.
  for (int iteration = 0; iteration < numItr; iteration++) {
    itemMatrix = itemMatrix.apply(new UpdateUserAndItemMatrix(numFeatures, lambda, parsedUserData, parsedItemData));
  }

  if (checkOutput) {
    // Renders a row as "key,v0,v1,...".
    final SimpleFunction<KV<Integer, float[]>, String> formatRow =
      new SimpleFunction<KV<Integer, float[]>, String>() {
        @Override
        public String apply(final KV<Integer, float[]> elem) {
          final String joinedValues = Stream.of(ArrayUtils.toObject(elem.getValue()))
            .map(String::valueOf)
            .collect(Collectors.joining(","));
          return elem.getKey() + "," + joinedValues;
        }
      };
    final PCollection<String> result =
      itemMatrix.apply(MapElements.<KV<Integer, float[]>, String>via(formatRow));

    GenericSourceSink.write(result, outputFilePath);
  }

  p.run().waitUntilFinish();
  LOG.info("JCT " + (System.currentTimeMillis() - start));
}
 
Example 19
Source File: AlternatingLeastSquareInefficient.java    From nemo with Apache License 2.0 4 votes vote down vote up
/**
 * Main function for the ALS BEAM program.
 *
 * @param args {@code args[0]}: input file path; {@code args[1]}: number of features;
 *             {@code args[2]}: number of iterations; {@code args[3]} (optional): lambda,
 *             which defaults to 0.05 when absent.
 */
public static void main(final String[] args) {
  final long start = System.currentTimeMillis();
  LOG.info(Arrays.toString(args));
  final String inputFilePath = args[0];
  final Integer numFeatures = Integer.parseInt(args[1]);
  final Integer numItr = Integer.parseInt(args[2]);
  // args[3] exists as soon as more than three arguments are given. The previous guard
  // (length > 4) silently ignored a lambda passed as the fourth argument.
  final Double lambda;
  if (args.length > 3) {
    lambda = Double.parseDouble(args[3]);
  } else {
    lambda = 0.05;
  }

  final PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(NemoPipelineRunner.class);
  options.setJobName("ALS");
  options.setStableUniqueNames(PipelineOptions.CheckEnabled.OFF);

  final Pipeline p = Pipeline.create(options);
  p.getCoderRegistry().registerCoderProvider(CoderProviders.fromStaticMethods(Pair.class, PairCoder.class));

  // Read raw data
  final PCollection<String> rawData = GenericSourceSink.read(p, inputFilePath);

  // Parse data for item
  final PCollection<KV<Integer, Pair<List<Integer>, List<Double>>>> parsedItemData = rawData
      .apply(ParDo.of(new AlternatingLeastSquare.ParseLine(false)))
      .apply(Combine.perKey(new AlternatingLeastSquare.TrainingDataCombiner()));

  // Create Initial Item Matrix
  PCollection<KV<Integer, List<Double>>> itemMatrix = parsedItemData
      .apply(ParDo.of(new DoFn<KV<Integer, Pair<List<Integer>, List<Double>>>, KV<Integer, List<Double>>>() {
        @ProcessElement
        public void processElement(final ProcessContext c) throws Exception {
          final KV<Integer, Pair<List<Integer>, List<Double>>> element = c.element();
          final List<Double> ratings = element.getValue().right();

          // Slot 0 holds the mean rating.
          double ratingSum = 0.0;
          for (final Double rating : ratings) {
            ratingSum += rating;
          }
          final List<Double> result = new ArrayList<>(numFeatures);
          result.add(ratingSum / ratings.size());

          // Fill the remaining numFeatures - 1 slots with small random values. The loop has to
          // be bounded by numFeatures: new ArrayList<>(n) only reserves capacity, so bounding it
          // by result.size() (which is 1 here) left the vector with a single element.
          for (int i = 1; i < numFeatures; i++) {
            result.add(Math.random() * 0.01);
          }
          c.output(KV.of(element.getKey(), result));
        }
      }));

  // Iterations to update Item Matrix.
  for (int i = 0; i < numItr; i++) {
    // NOTE: a single composite transform for the iteration.
    itemMatrix = itemMatrix.apply(new UpdateUserAndItemMatrix(numFeatures, lambda, rawData, parsedItemData));
  }

  p.run();
  LOG.info("JCT " + (System.currentTimeMillis() - start));
}
 
Example 20
Source File: HopPipelineMetaToBeamPipelineConverter.java    From hop with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the Beam pipeline for {@code pipelineMeta}: configures the pipeline options from the
 * run configuration, registers the coder for {@code HopRow} and then lets the input, generic
 * and output handlers add their transforms.
 *
 * @return the assembled pipeline
 * @throws Exception if the runner class cannot be determined or one of the handlers fails
 */
public Pipeline createPipeline() throws Exception {

  ILogChannel logChannel = LogChannel.GENERAL;

  // Resolve the runner class first so a failure here happens before any options are touched.
  Class<? extends PipelineRunner<?>> runnerClass =
    getPipelineRunnerClass( pipelineRunConfiguration.getRunnerType() );

  // Generic options
  PipelineOptions options = pipelineRunConfiguration.getPipelineOptions();
  String userAgent = pipelineRunConfiguration.environmentSubstitute( pipelineRunConfiguration.getUserAgent() );
  options.setUserAgent( userAgent );
  String tempLocation = pipelineRunConfiguration.environmentSubstitute( pipelineRunConfiguration.getTempLocation() );
  options.setTempLocation( tempLocation );
  options.setJobName( pipelineMeta.getName() );
  options.setRunner( runnerClass );

  Pipeline beamPipeline = Pipeline.create( options );
  beamPipeline.getCoderRegistry().registerCoderForClass( HopRow.class, new HopRowCoder() );

  logChannel.logBasic( "Created Apache Beam pipeline with name '" + options.getJobName() + "'" );

  // Shared between the three handlers below; maps a name to the collection of rows.
  Map<String, PCollection<HopRow>> collections = new HashMap<>();

  // Inputs first, then all the other transforms, then outputs.
  handleBeamInputSteps( logChannel, collections, beamPipeline );
  handleGenericStep( collections, beamPipeline );
  handleBeamOutputSteps( logChannel, collections, beamPipeline );

  return beamPipeline;
}