package nl.sidnlabs.entrada.parquet;

import java.util.HashMap;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import lombok.Value;
import lombok.extern.log4j.Log4j2;
import nl.sidnlabs.entrada.model.Partition;
import nl.sidnlabs.entrada.util.FileUtil;

@Log4j2
@Value
public class ParquetPartitionWriter {

  private String path;
  // by default a max of 3 million packets per file (+/- 125 MB files)
  private int maxRows;

  private Map<String, ParquetPartition<GenericRecord>> partitions = new HashMap<>();

  public ParquetPartitionWriter(String path, int maxRows) {
    this.path = path;
    this.maxRows = maxRows;
  }

  public void write(GenericRecord rec, Schema schema, Partition partition) {
    String partitionStr = FileUtil.appendPath(path, partition.toPath());
    // check if the partition already exists; if not, create a new partition
    ParquetPartition<GenericRecord> parquetPartition =
        partitions.computeIfAbsent(partitionStr, k -> new ParquetPartition<>(partitionStr, schema));

    // write the record to the partition
    parquetPartition.write(rec);

    // check if the Parquet partition has grown too big
    if (parquetPartition.getRows() >= maxRows) {
      log
          .info(
              "Max DNS packets reached for Parquet partition {}, closing current file and creating a new one",
              partitionStr);
      parquetPartition.close();
      // remove the partition from the partitions map; if another row arrives for this
      // partition, a new partition object and Parquet file will be created
      partitions.remove(partitionStr);
    }
  }

  public void close() {
    log.info("Closing {} partitions", partitions.size());
    // close the writers, letting them flush all data to disk
    partitions.values().forEach(ParquetPartition::close);
    // clear the partitions map, otherwise the Parquet writers remain referenced
    // and the GC cannot free their memory
    partitions.clear();
  }
}
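
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original class): a minimal example of how a
// caller might drive ParquetPartitionWriter. The one-field schema, the record
// contents, the output path and the Partition.builder() call are assumptions
// for illustration only; the real ENTRADA pipeline constructs these objects
// elsewhere, with a much larger DNS schema.
// ---------------------------------------------------------------------------
class ParquetPartitionWriterExample {

  public static void main(String[] args) {
    // hypothetical one-field Avro schema; the real DNS schema has many more fields
    Schema schema = org.apache.avro.SchemaBuilder
        .record("dns")
        .fields()
        .requiredString("qname")
        .endRecord();

    // roll over to a new Parquet file after 3 million rows per partition
    ParquetPartitionWriter writer = new ParquetPartitionWriter("/tmp/entrada", 3_000_000);

    GenericRecord rec = new org.apache.avro.generic.GenericData.Record(schema);
    rec.put("qname", "example.nl.");

    // Partition construction assumed here; adapt to the actual model class API
    Partition partition = Partition.builder().year(2024).month(1).day(1).build();

    writer.write(rec, schema, partition); // lazily creates the partition file
    writer.close();                       // flush and release all open writers
  }
}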