/*
 * Copyright 2018 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co.,
 * Inc., Kenilworth, NJ, USA.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.msd.gin.halyard.tools;

import com.msd.gin.halyard.common.HalyardTableUtils;
import com.yammer.metrics.core.Gauge;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import org.apache.commons.cli.CommandLine;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.protobuf.generated.AuthenticationProtos;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.htrace.Trace;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFHandler;
import org.eclipse.rdf4j.rio.ntriples.NTriplesUtil;

/**
 * MapReduce tool that effectively deletes large sets of triples or whole
 * named graphs from a Halyard dataset, based on a statement pattern and/or
 * named graph(s).
 *
 * @author Adam Sotona (MSD)
 */
public final class HalyardBulkDelete extends AbstractHalyardTool {

    private static final String SUBJECT = "halyard.delete.subject";
    private static final String PREDICATE = "halyard.delete.predicate";
    private static final String OBJECT = "halyard.delete.object";
    private static final String CONTEXTS = "halyard.delete.contexts";

    static final SimpleValueFactory SVF = SimpleValueFactory.getInstance();

    static final class DeleteMapper extends TableMapper<ImmutableBytesWritable, KeyValue> {

        long total = 0, deleted = 0;
        Resource subj;
        IRI pred;
        Value obj;
        List<Resource> ctx;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Parse the optional statement pattern criteria from the job configuration.
            SimpleValueFactory vf = SimpleValueFactory.getInstance();
            Configuration conf = context.getConfiguration();
            String s = conf.get(SUBJECT);
            if (s != null) {
                subj = NTriplesUtil.parseResource(s, vf);
            }
            String p = conf.get(PREDICATE);
            if (p != null) {
                pred = NTriplesUtil.parseURI(p, vf);
            }
            String o = conf.get(OBJECT);
            if (o != null) {
                obj = NTriplesUtil.parseValue(o, vf);
            }
            String[] cs = conf.getStrings(CONTEXTS);
            if (cs != null) {
                ctx = new ArrayList<>();
                for (String c : cs) {
                    // "NONE" stands for statements outside of any named graph.
                    if ("NONE".equals(c)) {
                        ctx.add(null);
                    } else {
                        ctx.add(NTriplesUtil.parseResource(c, vf));
                    }
                }
            }
        }
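
        /**
         * For each cell matching the configured statement pattern, emits a
         * DeleteColumn marker with the same coordinates and timestamp.
         * Bulk-loading the resulting HFiles masks all matching cell versions,
         * so the deletion takes effect without rewriting the table.
         */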
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context output) throws IOException, InterruptedException {
            for (Cell c : value.rawCells()) {
                Statement st = HalyardTableUtils.parseStatement(c, SVF);
                if ((subj == null || subj.equals(st.getSubject()))
                        && (pred == null || pred.equals(st.getPredicate()))
                        && (obj == null || obj.equals(st.getObject()))
                        && (ctx == null || ctx.contains(st.getContext()))) {
                    KeyValue kv = new KeyValue(
                        c.getRowArray(), c.getRowOffset(), (int) c.getRowLength(),
                        c.getFamilyArray(), c.getFamilyOffset(), (int) c.getFamilyLength(),
                        c.getQualifierArray(), c.getQualifierOffset(), c.getQualifierLength(),
                        c.getTimestamp(), KeyValue.Type.DeleteColumn,
                        c.getValueArray(), c.getValueOffset(), c.getValueLength(),
                        c.getTagsArray(), c.getTagsOffset(), c.getTagsLength());
                    output.write(new ImmutableBytesWritable(kv.getRowArray(), kv.getRowOffset(), kv.getRowLength()), kv);
                    deleted++;
                } else {
                    // No match: just report liveness so long-running scans are not killed.
                    output.progress();
                }
                if (total++ % 10000L == 0) {
                    String msg = MessageFormat.format("{0} / {1} cells deleted", deleted, total);
                    output.setStatus(msg);
                    LOG.log(Level.INFO, msg);
                }
            }
        }
    }

    public HalyardBulkDelete() {
        super(
            "bulkdelete",
            "Halyard Bulk Delete is a MapReduce application that effectively deletes a large set of triples or whole named graphs, based on a specified statement pattern and/or named graph(s).",
            "Example: halyard bulkdelete -t my_data -f bulkdelete_temp1 -s <http://whatever/mysubj> -g <http://whatever/mygraph1> -g <http://whatever/mygraph2>"
        );
        addOption("t", "target-dataset", "dataset_table", "HBase table with Halyard RDF store", true, true);
        addOption("f", "temp-folder", "temporary_folder", "Temporary folder for HBase files", true, true);
        addOption("s", "subject", "subject", "Optional subject to delete", false, true);
        addOption("p", "predicate", "predicate", "Optional predicate to delete", false, true);
        addOption("o", "object", "object", "Optional object to delete", false, true);
        addOption("g", "named-graph", "named_graph", "Optional named graph(s) to delete, NONE represents triples outside of any named graph", false, false);
    }

    @Override
    public int run(CommandLine cmd) throws Exception {
        String source = cmd.getOptionValue('t');
        TableMapReduceUtil.addDependencyJars(getConf(),
            HalyardExport.class,
            NTriplesUtil.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class,
            HTable.class,
            HBaseConfiguration.class,
            AuthenticationProtos.class,
            Trace.class,
            Gauge.class);
        HBaseConfiguration.addHbaseResources(getConf());
        Job job = Job.getInstance(getConf(), "HalyardDelete " + source);
        if (cmd.hasOption('s')) {
            job.getConfiguration().set(SUBJECT, cmd.getOptionValue('s'));
        }
        if (cmd.hasOption('p')) {
            job.getConfiguration().set(PREDICATE, cmd.getOptionValue('p'));
        }
        if (cmd.hasOption('o')) {
            job.getConfiguration().set(OBJECT, cmd.getOptionValue('o'));
        }
        if (cmd.hasOption('g')) {
            job.getConfiguration().setStrings(CONTEXTS, cmd.getOptionValues('g'));
        }
        job.setJarByClass(HalyardBulkDelete.class);
        TableMapReduceUtil.initCredentials(job);
        // Scan the whole table; the statement pattern is applied in the mapper.
        Scan scan = HalyardTableUtils.scan(null, null);
        TableMapReduceUtil.initTableMapperJob(source, scan, DeleteMapper.class, ImmutableBytesWritable.class, LongWritable.class, job);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        // Speculative task attempts could produce duplicate output files, so disable them.
        job.setSpeculativeExecution(false);
        job.setMapSpeculativeExecution(false);
        job.setReduceSpeculativeExecution(false);
        try (HTable hTable = HalyardTableUtils.getTable(getConf(), source, false, 0)) {
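            // configureIncrementalLoad wires the job for HFile output: a total-order
            // partitioner matching the table's current region boundaries and one
            // reduce task per region, so the generated HFiles line up with the
            // regions for the bulk load below.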
            HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
            FileOutputFormat.setOutputPath(job, new Path(cmd.getOptionValue('f')));
            TableMapReduceUtil.addDependencyJars(job);
            if (job.waitForCompletion(true)) {
                // Move the generated HFiles (carrying the delete markers) into the table.
                new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(cmd.getOptionValue('f')), hTable);
                LOG.info("Bulk Delete completed.");
                return 0;
            }
        }
        return -1;
    }
}