Java Code Examples for org.apache.hadoop.io.SequenceFile.Writer#append()

The following examples show how to use org.apache.hadoop.io.SequenceFile.Writer#append(). Each example is drawn from an open source project; the source file and the project it comes from are noted above each snippet.
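
All of the snippets below share the same basic pattern: obtain a SequenceFile.Writer, call append() once per key/value record, and close the writer to finalize the file. As a minimal, self-contained sketch of that pattern (the class name AppendSketch and the path /tmp/example.seq are placeholders, not taken from any of the projects below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;

public class AppendSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.seq"); // placeholder output path
        Writer writer = SequenceFile.createWriter(conf,
                Writer.file(path),
                Writer.keyClass(IntWritable.class),
                Writer.valueClass(Text.class));
        try {
            // Each append() writes one record; the key and value must match
            // the classes declared when the writer was created.
            writer.append(new IntWritable(1), new Text("one"));
            writer.append(new IntWritable(2), new Text("two"));
        } finally {
            writer.close(); // flushes buffered records and finalizes the file
        }
    }
}

Keys and values are typically Writable implementations, though as Examples 4, 7, and 8 show, plain Java types also work when a matching serialization is configured.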
Example 1
Source File: TarUnpackerSequenceFileWriter.java    From localization_nifi with Apache License 2.0
@Override
protected void processInputStream(final InputStream stream, final FlowFile tarArchivedFlowFile, final Writer writer) throws IOException {
    try (final TarArchiveInputStream tarIn = new TarArchiveInputStream(new BufferedInputStream(stream))) {
        TarArchiveEntry tarEntry;
        while ((tarEntry = tarIn.getNextTarEntry()) != null) {
            if (tarEntry.isDirectory()) {
                continue;
            }
            final String key = tarEntry.getName();
            final long fileSize = tarEntry.getSize();
            final InputStreamWritable inStreamWritable = new InputStreamWritable(tarIn, (int) fileSize);
            writer.append(new Text(key), inStreamWritable);
            logger.debug("Appending FlowFile {} to Sequence File", new Object[]{key});
        }
    }
}
 
Example 2
Source File: SequenceFileUtil.java    From alchemy with Apache License 2.0
public static void writeSequenceFile(String path) throws Exception {
    Writer.Option filePath = Writer.file(new Path(path));
    Writer.Option keyClass = Writer.keyClass(IntWritable.class);
    Writer.Option valueClass = Writer.valueClass(Text.class);
    Writer.Option compression = Writer.compression(CompressionType.NONE);
    Writer writer = SequenceFile.createWriter(configuration, filePath, keyClass, valueClass, compression);
    IntWritable key = new IntWritable();
    Text value = new Text("");
    for (int i = 0; i < 100; i++) {
        key.set(i);
        value.set("value_" + i);
        writer.append(key, value);
    }
    writer.hflush();
    writer.close();
}
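
For reference, the records written by this example can be read back with the companion SequenceFile.Reader API. A minimal sketch, reusing the same configuration field and path argument as above:

IntWritable key = new IntWritable();
Text value = new Text();
SequenceFile.Reader reader = new SequenceFile.Reader(configuration, SequenceFile.Reader.file(new Path(path)));
try {
    // next() fills key and value with the next record, returning false at end of file
    while (reader.next(key, value)) {
        System.out.println(key.get() + " -> " + value);
    }
} finally {
    reader.close();
}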
 
Example 3
Source File: ZipUnpackerSequenceFileWriter.java    From nifi with Apache License 2.0
@Override
protected void processInputStream(InputStream stream, final FlowFile flowFile, final Writer writer) throws IOException {

    try (final ZipInputStream zipIn = new ZipInputStream(new BufferedInputStream(stream))) {
        ZipEntry zipEntry;
        while ((zipEntry = zipIn.getNextEntry()) != null) {
            if (zipEntry.isDirectory()) {
                continue;
            }
            final File file = new File(zipEntry.getName());
            final String key = file.getName();
            long fileSize = zipEntry.getSize();
            final InputStreamWritable inStreamWritable = new InputStreamWritable(zipIn, (int) fileSize);
            writer.append(new Text(key), inStreamWritable);
            logger.debug("Appending FlowFile {} to Sequence File", new Object[]{key});
        }
    }
}
 
Example 4
Source File: TestSequenceFileSerialization.java    From RDFS with Apache License 2.0
public void testJavaSerialization() throws Exception {
  Path file = new Path(System.getProperty("test.build.data",".") +
      "/test.seq");
  
  fs.delete(file, true);
  Writer writer = SequenceFile.createWriter(fs, conf, file, Long.class,
      String.class);
  
  writer.append(1L, "one");
  writer.append(2L, "two");
  
  writer.close();
  
  Reader reader = new Reader(fs, file, conf);
  assertEquals(1L, reader.next((Object) null));
  assertEquals("one", reader.getCurrentValue((Object) null));
  assertEquals(2L, reader.next((Object) null));
  assertEquals("two", reader.getCurrentValue((Object) null));
  assertNull(reader.next((Object) null));
  reader.close();
  
}
 
Example 5
Source File: SnapshotIndexDeletionPolicy.java    From incubator-retired-blur with Apache License 2.0
private synchronized void storeGenerations() throws IOException {
  FileSystem fileSystem = _path.getFileSystem(_configuration);
  FileStatus[] listStatus = fileSystem.listStatus(_path);
  SortedSet<FileStatus> existing = new TreeSet<FileStatus>(Arrays.asList(listStatus));
  long currentFile;
  if (!existing.isEmpty()) {
    FileStatus last = existing.last();
    currentFile = Long.parseLong(last.getPath().getName());
  } else {
    currentFile = 0;
  }
  Path path = new Path(_path, buffer(currentFile + 1));
  LOG.info("Creating new snapshot file [{0}]", path);
  FSDataOutputStream outputStream = fileSystem.create(path, false);
  Writer writer = SequenceFile.createWriter(_configuration, outputStream, Text.class, LongWritable.class,
      CompressionType.NONE, null);
  for (Entry<String, Long> e : _namesToGenerations.entrySet()) {
    writer.append(new Text(e.getKey()), new LongWritable(e.getValue()));
  }
  writer.close();
  outputStream.close();
  cleanupOldFiles(fileSystem, existing);
}
 
Example 6
Source File: MergeSortRowIdMatcher.java    From incubator-retired-blur with Apache License 2.0
private void writeRowIds(Writer writer, SegmentReader segmentReader) throws IOException {
  Terms terms = segmentReader.terms(BlurConstants.ROW_ID);
  if (terms == null) {
    return;
  }
  TermsEnum termsEnum = terms.iterator(null);
  BytesRef rowId;
  long s = System.nanoTime();
  while ((rowId = termsEnum.next()) != null) {
    long n = System.nanoTime();
    if (n - s > _10_SECONDS) { // report progress at most once every ten seconds
      _progressable.progress();
      s = System.nanoTime();
    }
    writer.append(new Text(rowId.utf8ToString()), NullWritable.get());
  }
}
 
Example 7
Source File: TestSequenceFileSerialization.java    From big-c with Apache License 2.0
public void testJavaSerialization() throws Exception {
  Path file = new Path(System.getProperty("test.build.data",".") +
      "/testseqser.seq");
  
  fs.delete(file, true);
  Writer writer = SequenceFile.createWriter(fs, conf, file, Long.class,
      String.class);
  
  writer.append(1L, "one");
  writer.append(2L, "two");
  
  writer.close();
  
  Reader reader = new Reader(fs, file, conf);
  assertEquals(1L, reader.next((Object) null));
  assertEquals("one", reader.getCurrentValue((Object) null));
  assertEquals(2L, reader.next((Object) null));
  assertEquals("two", reader.getCurrentValue((Object) null));
  assertNull(reader.next((Object) null));
  reader.close();
  
}
 
Example 8
Source File: TestSequenceFileSerialization.java    From hadoop with Apache License 2.0
public void testJavaSerialization() throws Exception {
  Path file = new Path(System.getProperty("test.build.data",".") +
      "/testseqser.seq");
  
  fs.delete(file, true);
  Writer writer = SequenceFile.createWriter(fs, conf, file, Long.class,
      String.class);
  
  writer.append(1L, "one");
  writer.append(2L, "two");
  
  writer.close();
  
  Reader reader = new Reader(fs, file, conf);
  assertEquals(1L, reader.next((Object) null));
  assertEquals("one", reader.getCurrentValue((Object) null));
  assertEquals(2L, reader.next((Object) null));
  assertEquals("two", reader.getCurrentValue((Object) null));
  assertNull(reader.next((Object) null));
  reader.close();
  
}
 
Example 9
Source File: ZipUnpackerSequenceFileWriter.java    From localization_nifi with Apache License 2.0
@Override
protected void processInputStream(InputStream stream, final FlowFile flowFile, final Writer writer) throws IOException {

    try (final ZipInputStream zipIn = new ZipInputStream(new BufferedInputStream(stream))) {
        ZipEntry zipEntry;
        while ((zipEntry = zipIn.getNextEntry()) != null) {
            if (zipEntry.isDirectory()) {
                continue;
            }
            final File file = new File(zipEntry.getName());
            final String key = file.getName();
            long fileSize = zipEntry.getSize();
            final InputStreamWritable inStreamWritable = new InputStreamWritable(zipIn, (int) fileSize);
            writer.append(new Text(key), inStreamWritable);
            logger.debug("Appending FlowFile {} to Sequence File", new Object[]{key});
        }
    }
}
 
Example 10
Source File: TarUnpackerSequenceFileWriter.java    From nifi with Apache License 2.0
@Override
protected void processInputStream(final InputStream stream, final FlowFile tarArchivedFlowFile, final Writer writer) throws IOException {
    try (final TarArchiveInputStream tarIn = new TarArchiveInputStream(new BufferedInputStream(stream))) {
        TarArchiveEntry tarEntry;
        while ((tarEntry = tarIn.getNextTarEntry()) != null) {
            if (tarEntry.isDirectory()) {
                continue;
            }
            final String key = tarEntry.getName();
            final long fileSize = tarEntry.getSize();
            final InputStreamWritable inStreamWritable = new InputStreamWritable(tarIn, (int) fileSize);
            writer.append(new Text(key), inStreamWritable);
            logger.debug("Appending FlowFile {} to Sequence File", new Object[]{key});
        }
    }
}
 
Example 11
Source File: TestHSync.java    From big-c with Apache License 2.0
/** Test hsync via SequenceFiles */
@Test
public void testSequenceFileSync() throws Exception {
  Configuration conf = new HdfsConfiguration();
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();

  final FileSystem fs = cluster.getFileSystem();
  final Path p = new Path("/testSequenceFileSync/foo");
  final int len = 1 << 16;
  FSDataOutputStream out = fs.create(p, FsPermission.getDefault(),
      EnumSet.of(CreateFlag.CREATE, CreateFlag.OVERWRITE, CreateFlag.SYNC_BLOCK),
      4096, (short) 1, len, null);
  Writer w = SequenceFile.createWriter(new Configuration(),
      Writer.stream(out),
      Writer.keyClass(RandomDatum.class),
      Writer.valueClass(RandomDatum.class),
      Writer.compression(CompressionType.NONE, new DefaultCodec()));
  w.hflush();
  checkSyncMetric(cluster, 0);
  w.hsync();
  checkSyncMetric(cluster, 1);
  int seed = new Random().nextInt();
  RandomDatum.Generator generator = new RandomDatum.Generator(seed);
  generator.next();
  w.append(generator.getKey(), generator.getValue());
  w.hsync();
  checkSyncMetric(cluster, 2);
  w.close();
  checkSyncMetric(cluster, 2);
  out.close();
  checkSyncMetric(cluster, 3);
  cluster.shutdown();
}
 
Example 12
Source File: TestHSync.java    From hadoop with Apache License 2.0
/** Test hsync via SequenceFiles */
@Test
public void testSequenceFileSync() throws Exception {
  Configuration conf = new HdfsConfiguration();
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();

  final FileSystem fs = cluster.getFileSystem();
  final Path p = new Path("/testSequenceFileSync/foo");
  final int len = 1 << 16;
  FSDataOutputStream out = fs.create(p, FsPermission.getDefault(),
      EnumSet.of(CreateFlag.CREATE, CreateFlag.OVERWRITE, CreateFlag.SYNC_BLOCK),
      4096, (short) 1, len, null);
  Writer w = SequenceFile.createWriter(new Configuration(),
      Writer.stream(out),
      Writer.keyClass(RandomDatum.class),
      Writer.valueClass(RandomDatum.class),
      Writer.compression(CompressionType.NONE, new DefaultCodec()));
  w.hflush();
  checkSyncMetric(cluster, 0);
  w.hsync();
  checkSyncMetric(cluster, 1);
  int seed = new Random().nextInt();
  RandomDatum.Generator generator = new RandomDatum.Generator(seed);
  generator.next();
  w.append(generator.getKey(), generator.getValue());
  w.hsync();
  checkSyncMetric(cluster, 2);
  w.close();
  checkSyncMetric(cluster, 2);
  out.close();
  checkSyncMetric(cluster, 3);
  cluster.shutdown();
}
 
Example 13
Source File: CopySeq.java    From Kylin with Apache License 2.0
public static void copyTo64MB(String src, String dst) throws IOException {
    Configuration hconf = new Configuration();
    Path srcPath = new Path(src);
    Path dstPath = new Path(dst);

    FileSystem fs = FileSystem.get(hconf);
    long srcSize = fs.getFileStatus(srcPath).getLen();
    int copyTimes = (int) (67108864 / srcSize); // 64 MB
    System.out.println("Copy " + copyTimes + " times");

    Reader reader = new Reader(hconf, SequenceFile.Reader.file(srcPath));
    Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), hconf);
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(hconf, Writer.file(dstPath),
            Writer.keyClass(key.getClass()), Writer.valueClass(Text.class),
            Writer.compression(CompressionType.BLOCK, getLZOCodec(hconf)));

    int count = 0;
    while (reader.next(key, value)) {
        for (int i = 0; i < copyTimes; i++) {
            writer.append(key, value);
            count++;
        }
    }

    System.out.println("Len: " + writer.getLength());
    System.out.println("Rows: " + count);

    reader.close();
    writer.close();
}
 
Example 14
Source File: DriverTest.java    From incubator-retired-blur with Apache License 2.0
private void generateData(String mrIncWorkingPathStr, String rowId, String recordId, String value) throws IOException {
  Path path = new Path(new Path(mrIncWorkingPathStr), "new");
  Writer writer = new SequenceFile.Writer(miniCluster.getFileSystem(), conf, new Path(path, UUID.randomUUID()
      .toString()), Text.class, BlurRecord.class);
  BlurRecord blurRecord = new BlurRecord();
  blurRecord.setRowId(rowId);
  blurRecord.setRecordId(recordId);
  blurRecord.setFamily("fam0");
  blurRecord.addColumn("col0", value);
  writer.append(new Text(rowId), blurRecord);
  writer.close();
}
 
Example 15
Source File: BlurHiveOutputFormat.java    From incubator-retired-blur with Apache License 2.0
private org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter getMrWorkingPathWriter(
    final Configuration configuration) throws IOException {
  PrivilegedExceptionAction<org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter> privilegedExceptionAction = new PrivilegedExceptionAction<org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter>() {
    @Override
    public org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter run() throws Exception {
      String workingPathStr = configuration.get(BlurConstants.BLUR_BULK_UPDATE_WORKING_PATH);
      Path workingPath = new Path(workingPathStr);
      Path tmpDir = new Path(workingPath, "tmp");
      FileSystem fileSystem = tmpDir.getFileSystem(configuration);
      String loadId = configuration.get(BlurSerDe.BLUR_MR_LOAD_ID);
      Path loadPath = new Path(tmpDir, loadId);
      final Writer writer = new SequenceFile.Writer(fileSystem, configuration, new Path(loadPath, UUID.randomUUID()
          .toString()), Text.class, BlurRecord.class);

      return new org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter() {

        @Override
        public void write(Writable w) throws IOException {
          BlurRecord blurRecord = (BlurRecord) w;
          String rowId = blurRecord.getRowId();
          writer.append(new Text(rowId), blurRecord);
        }

        @Override
        public void close(boolean abort) throws IOException {
          writer.close();
        }
      };
    }
  };

  UserGroupInformation userGroupInformation = getUGI(configuration);
  try {
    return userGroupInformation.doAs(privilegedExceptionAction);
  } catch (InterruptedException e) {
    throw new IOException(e);
  }
}
 
Example 16
Source File: SparkUimaUtils.java    From ambiverse-nlu with Apache License 2.0
public static void createSequenceFile(Object[] params, String uri)
    throws URISyntaxException, IOException, UIMAException, NoSuchMethodException, MissingSettingException, ClassNotFoundException {
  Configuration conf = new Configuration();
  Path path = new Path(uri);
  Writer writer =
      SequenceFile.createWriter(
          conf, Writer.file(path),
          Writer.keyClass(Text.class),
          Writer.valueClass(SCAS.class));

  int count = 0;

  CollectionReaderDescription readerDescription = Reader.getCollectionReaderDescription(Reader.COLLECTION_FORMAT.NYT, params);
  for (JCas jCas : SimplePipelineCasPoolIterator.iteratePipeline(20, readerDescription)) {
    if (JCasUtil.exists(jCas, DocumentMetaData.class)) {
      ++count;
      // Get the ID.
      DocumentMetaData dmd = JCasUtil.selectSingle(jCas, DocumentMetaData.class);
      String docId = "NULL";
      if (dmd != null) {
        docId = dmd.getDocumentId();
      } else {
        throw new IOException("No Document ID for xml: " + jCas.getView("xml").getDocumentText());
      }
      Text docIdText = new Text(docId);
      SCAS scas = new SCAS(jCas.getCas());
      writer.append(docIdText, scas);
    }
    jCas.release();
  }
  logger.info("Wrote " + count + " documents to " + uri);
  IOUtils.closeStream(writer);
}
 
Example 17
Source File: ConvertFastaForCloud.java    From emr-sample-apps with Apache License 2.0
public static void saveSequence(int id, StringBuilder sequence, Writer writer) throws IOException
{	
	int fulllength = sequence.length();
	int maxchunk = 65535;
	
	if (fulllength < min_seq_len) { min_seq_len = fulllength; }
	if (fulllength > max_seq_len) { max_seq_len = fulllength; }
	
	if (fulllength > 100)
	{
		System.out.println("In " + id + "... "  + fulllength + "bp");
	}
	
	int offset = 0;
	int numchunks = 0;
	
	while(offset < fulllength)
	{
		numchunks++;
		int end = min(offset + maxchunk, fulllength);
		
		boolean lastChunk = (end == fulllength);

		record.m_sequence = DNAString.stringToBytes(sequence.substring(offset, end));
		record.m_offset = offset;
		record.m_lastChunk = lastChunk;
		
		iw.set(id);
		writer.append(iw, record.toBytes());
		
		if (end == fulllength) 
		{ 
			offset = fulllength; 
		}
		else
		{
			offset = end - cloudBurst.CloudBurst.CHUNK_OVERLAP;
		}
	}
	
	if (numchunks > 1)
	{
		System.out.println("  " + numchunks + " chunks");
	}
}
 
Example 18
Source File: SequenceFileWriterImpl.java    From nifi with Apache License 2.0
protected void processInputStream(InputStream stream, FlowFile flowFile, final Writer writer) throws IOException {
    int fileSize = (int) flowFile.getSize();
    final InputStreamWritable inStreamWritable = new InputStreamWritable(new BufferedInputStream(stream), fileSize);
    String key = flowFile.getAttribute(CoreAttributes.FILENAME.key());
    writer.append(new Text(key), inStreamWritable);
}
 
Example 19
Source File: SequenceFileWriterImpl.java    From localization_nifi with Apache License 2.0
protected void processInputStream(InputStream stream, FlowFile flowFile, final Writer writer) throws IOException {
    int fileSize = (int) flowFile.getSize();
    final InputStreamWritable inStreamWritable = new InputStreamWritable(new BufferedInputStream(stream), fileSize);
    String key = flowFile.getAttribute(CoreAttributes.FILENAME.key());
    writer.append(new Text(key), inStreamWritable);
}
 
Example 20
Source File: BulkIngestMapFileLoader.java    From datawave with Apache License 2.0
private void writeStats(Path[] jobDirectories) throws IOException {
    if (!INGEST_METRICS) {
        log.info("ingest metrics disabled");
    } else {
        long now = System.currentTimeMillis();
        for (Path p : jobDirectories)
            reporter.getCounter("MapFileLoader.EndTimes", p.getName()).increment(now);
        // Write out the metrics.
        // We are going to serialize the counters into a file in HDFS.
        // The context was set in the processKeyValues method below, and should not be null. We'll guard against NPE anyway
        FileSystem fs = getFileSystem(seqFileHdfs);
        RawLocalFileSystem rawFS = new RawLocalFileSystem();
        rawFS.setConf(conf);
        CompressionCodec cc = new GzipCodec();
        CompressionType ct = CompressionType.BLOCK;
        
        Counters c = reporter.getCounters();
        if (null != c && c.countCounters() > 0) {
            // Serialize the counters to a file in HDFS.
            Path src = new Path(File.createTempFile("MapFileLoader", ".metrics").getAbsolutePath());
            Writer writer = SequenceFile.createWriter(conf, Writer.file(rawFS.makeQualified(src)), Writer.keyClass(NullWritable.class),
                            Writer.valueClass(Counters.class), Writer.compression(ct, cc));
            writer.append(NullWritable.get(), c);
            writer.close();
            
            // Now we will try to move the file to HDFS.
            // Copy the file to the temp dir
            try {
                Path mDir = new Path(workDir, "MapFileLoaderMetrics");
                if (!fs.exists(mDir))
                    fs.mkdirs(mDir);
                Path dst = new Path(mDir, src.getName());
                log.info("Copying file " + src + " to " + dst);
                fs.copyFromLocalFile(false, true, src, dst);
                // If this worked, then remove the local file
                rawFS.delete(src, false);
                // also remove the residual crc file
                rawFS.delete(getCrcFile(src), false);
            } catch (IOException e) {
                // If an error occurs in the copy, leave the file in the local metrics directory.
                log.error("Error copying metrics file into HDFS; it will remain in the local metrics directory.");
            }
            
            // reset reporter so that old metrics don't persist over time
            this.reporter = new StandaloneStatusReporter();
        }
    }
}