org.eclipse.rdf4j.rio.helpers.AbstractRDFHandler Java Examples

The following examples show how to use org.eclipse.rdf4j.rio.helpers.AbstractRDFHandler. Each example is taken from an open-source project; the source file, originating project, and license are noted above the code.
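AbstractRDFHandler provides no-op implementations of all RDFHandler callbacks, so a subclass overrides only the callbacks it needs (usually handleStatement). Before the project examples below, here is a minimal, self-contained sketch of that pattern: counting the statements in a Turtle file. The file name data.ttl and the counting logic are illustrative only and do not come from any of the projects listed here.

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.concurrent.atomic.AtomicLong;

import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFHandler;

public class CountStatements {
    public static void main(String[] args) throws Exception {
        final AtomicLong count = new AtomicLong();
        RDFParser parser = Rio.createParser(RDFFormat.TURTLE);
        // Override only handleStatement; the remaining callbacks keep their no-op defaults.
        parser.setRDFHandler(new AbstractRDFHandler() {
            @Override
            public void handleStatement(Statement st) throws RDFHandlerException {
                count.incrementAndGet();
            }
        });
        // The second argument is the base IRI used to resolve relative IRIs in the document.
        try (InputStream in = new FileInputStream("data.ttl")) { // "data.ttl" is a placeholder path
            parser.parse(in, "");
        }
        System.out.println("Parsed " + count.get() + " statements");
    }
}
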
Example #1
Source File: PubchemTTLMerger.java    From act with GNU General Public License v3.0
protected void buildIndex(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, List<File> rdfFiles)
    throws RocksDBException, ClassNotFoundException, IOException {
  LOGGER.info("Building RocksDB index of data in RDF files");
  RDFParser parser = Rio.createParser(RDFFormat.TURTLE);

  LOGGER.info("Processing %d RDF files", rdfFiles.size());
  for (File rdfFile : rdfFiles) {
    LOGGER.info("Processing file %s", rdfFile.getAbsolutePath());
    AbstractRDFHandler handler = PC_RDF_DATA_FILE_CONFIG.makeHandlerForDataFile(dbAndHandles, rdfFile);
    if (handler == null) {
      LOGGER.info("Skipping file without defined handler: %s", rdfFile.getAbsolutePath());
      continue;
    }

    parser.setRDFHandler(handler);
    parser.parse(new GZIPInputStream(new FileInputStream(rdfFile)), "");
    LOGGER.info("Successfully parsed file at %s", rdfFile.getAbsolutePath());
  }
  LOGGER.info("Done processing RDF files");
}
 
Example #2
Source File: HalyardExportTest.java    From Halyard with Apache License 2.0
private static int getTriplesCount(String uri, String compression, RDFFormat format) throws Exception {
    InputStream in = FileSystem.get(URI.create(uri), HBaseServerTestInstance.getInstanceConfig()).open(new Path(uri));
    try {
        if (compression != null) {
            in = new CompressorStreamFactory().createCompressorInputStream(compression, in);
        }
        RDFParser parser = Rio.createParser(format);
        final AtomicInteger i = new AtomicInteger();
        parser.setRDFHandler(new AbstractRDFHandler(){
            @Override
            public void handleStatement(Statement st) throws RDFHandlerException {
                i.incrementAndGet();
            }
        });
        parser.parse(in, uri);
        return i.get();
    } finally {
        in.close();
    }
}
 
Example #3
Source File: InferenceEngine.java    From rya with Apache License 2.0
private void refreshSomeValuesFromRestrictions(final Map<Resource, IRI> restrictions) throws QueryEvaluationException {
    someValuesFromByRestrictionType.clear();
    ryaDaoQueryWrapper.queryAll(null, OWL.SOMEVALUESFROM, null, new AbstractRDFHandler() {
        @Override
        public void handleStatement(final Statement statement) throws RDFHandlerException {
            final Resource restrictionClass = statement.getSubject();
            if (restrictions.containsKey(restrictionClass) && statement.getObject() instanceof Resource) {
                final IRI property = restrictions.get(restrictionClass);
                final Resource valueClass = (Resource) statement.getObject();
                // Should also be triggered by subclasses of the value class
                final Set<Resource> valueClasses = new HashSet<>();
                valueClasses.add(valueClass);
                if (valueClass instanceof IRI) {
                    valueClasses.addAll(getSubClasses((IRI) valueClass));
                }
                for (final Resource valueSubClass : valueClasses) {
                    if (!someValuesFromByRestrictionType.containsKey(restrictionClass)) {
                        someValuesFromByRestrictionType.put(restrictionClass, new ConcurrentHashMap<>());
                    }
                    someValuesFromByRestrictionType.get(restrictionClass).put(valueSubClass, property);
                }
            }
        }
    });
}
 
Example #4
Source File: InferenceEngine.java    From rya with Apache License 2.0
private void refreshAllValuesFromRestrictions(final Map<Resource, IRI> restrictions) throws QueryEvaluationException {
    allValuesFromByValueType.clear();
    ryaDaoQueryWrapper.queryAll(null, OWL.ALLVALUESFROM, null, new AbstractRDFHandler() {
        @Override
        public void handleStatement(final Statement statement) throws RDFHandlerException {
            final Resource directRestrictionClass = statement.getSubject();
            if (restrictions.containsKey(directRestrictionClass) && statement.getObject() instanceof Resource) {
                final IRI property = restrictions.get(directRestrictionClass);
                final Resource valueClass = (Resource) statement.getObject();
                // Should also be triggered by subclasses of the property restriction
                final Set<Resource> restrictionClasses = new HashSet<>();
                restrictionClasses.add(directRestrictionClass);
                if (directRestrictionClass instanceof IRI) {
                    restrictionClasses.addAll(getSubClasses((IRI) directRestrictionClass));
                }
                for (final Resource restrictionClass : restrictionClasses) {
                    if (!allValuesFromByValueType.containsKey(valueClass)) {
                        allValuesFromByValueType.put(valueClass, new ConcurrentHashMap<>());
                    }
                    allValuesFromByValueType.get(valueClass).put(restrictionClass, property);
                }
            }
        }
    });
}
 
Example #5
Source File: PubchemTTLMerger.java    From act with GNU General Public License v3.0
public static AbstractRDFHandler makeHandlerForDataFile(
    Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, File file) {
  PC_RDF_DATA_FILE_CONFIG config = getDataTypeForFile(file);
  if (config == null) {
    LOGGER.info("No handler config found for file %s", file.getAbsolutePath());
    return null;
  }
  LOGGER.info("Selected handler type %s for file %s", config.name(), file.getName());

  return new PCRDFHandler(
      dbAndHandles,
      config.columnFamily,
      config.keyType,
      config.valType,
      config.reverseSubjectAndObject,
      config.valueTransformer
  );
}
 
Example #6
Source File: RepositoryConnectionTest.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
@Test
public void testInsertDelete() throws RDF4JException {
	final Statement stmt = vf.createStatement(vf.createIRI(URN_TEST_S1), vf.createIRI(URN_TEST_P1),
			vf.createIRI(URN_TEST_O1));
	testCon.begin();
	testCon.prepareUpdate(QueryLanguage.SPARQL,
			"INSERT DATA {<" + URN_TEST_S1 + "> <" + URN_TEST_P1 + "> <" + URN_TEST_O1 + ">}").execute();
	testCon.prepareUpdate(QueryLanguage.SPARQL,
			"DELETE DATA {<" + URN_TEST_S1 + "> <" + URN_TEST_P1 + "> <" + URN_TEST_O1 + ">}").execute();
	testCon.commit();

	testCon.exportStatements(null, null, null, false, new AbstractRDFHandler() {

		@Override
		public void handleStatement(Statement st) throws RDFHandlerException {
			assertThat(st).isNotEqualTo(stmt);
		}
	});
}
 
Example #7
Source File: RepositoryConnectionTest.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
@Test
public final void testInsertRemove() throws RDF4JException {
	final Statement stmt = vf.createStatement(vf.createIRI(URN_TEST_S1), vf.createIRI(URN_TEST_P1),
			vf.createIRI(URN_TEST_O1));
	testCon.begin();
	testCon.prepareUpdate(QueryLanguage.SPARQL,
			"INSERT DATA {<" + URN_TEST_S1 + "> <" + URN_TEST_P1 + "> <" + URN_TEST_O1 + ">}").execute();
	testCon.remove(stmt);
	testCon.commit();

	testCon.exportStatements(null, null, null, false, new AbstractRDFHandler() {

		@Override
		public void handleStatement(Statement st) throws RDFHandlerException {
			assertThat(st).isNotEqualTo(stmt);
		}
	});
}
 
Example #8
Source File: RepositoryConnectionTest.java    From rdf4j with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testAddDelete() throws RDF4JException {
	final Statement stmt = vf.createStatement(vf.createIRI(URN_TEST_S1), vf.createIRI(URN_TEST_P1),
			vf.createIRI(URN_TEST_O1));
	testCon.begin();
	testCon.add(stmt);
	testCon.prepareUpdate(QueryLanguage.SPARQL,
			"DELETE DATA {<" + URN_TEST_S1 + "> <" + URN_TEST_P1 + "> <" + URN_TEST_O1 + ">}").execute();
	testCon.commit();

	testCon.exportStatements(null, null, null, false, new AbstractRDFHandler() {

		@Override
		public void handleStatement(Statement st) throws RDFHandlerException {
			assertThat(st).isNotEqualTo(stmt);
		}
	});
}
 
Example #9
Source File: RepositoryConnectionTest.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
@Test
public void testAddRemove() throws RDF4JException {
	final Statement stmt = vf.createStatement(vf.createIRI(URN_TEST_S1), vf.createIRI(URN_TEST_P1),
			vf.createIRI(URN_TEST_O1));
	testCon.begin();
	testCon.add(stmt);
	testCon.remove(stmt);
	testCon.commit();

	testCon.exportStatements(null, null, null, false, new AbstractRDFHandler() {

		@Override
		public void handleStatement(Statement st) throws RDFHandlerException {
			assertThat(st).isNotEqualTo(stmt);
		}
	});
}
 
Example #10
Source File: AbstractNQuadsParserUnitTest.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
public void testNQuadsFile() throws Exception {
	RDFParser nquadsParser = createRDFParser();
	nquadsParser.setRDFHandler(new AbstractRDFHandler() {
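		// No-op handler: the test only verifies that the N-Quads document parses without error.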
	});

	try (InputStream in = AbstractNQuadsParserUnitTest.class.getResourceAsStream(NQUADS_TEST_FILE)) {
		nquadsParser.parse(in, NQUADS_TEST_URL);
	} catch (RDFParseException e) {
		fail("NQuadsParser failed to parse N-Quads test document: " + e.getMessage());
	}
}
 
Example #11
Source File: AbstractNQuadsParserUnitTest.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
/**
 * The N-Quads parser must be able to parse the N-Triples test file without error.
 */
public void testNTriplesFile() throws Exception {
	RDFParser nquadsParser = createRDFParser();
	nquadsParser.setRDFHandler(new AbstractRDFHandler() {
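		// No-op handler; only successful parsing of the N-Triples document is asserted.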
	});

	try (InputStream in = AbstractNQuadsParserUnitTest.class.getResourceAsStream(NTRIPLES_TEST_FILE)) {
		nquadsParser.parse(in, NTRIPLES_TEST_URL);
	} catch (RDFParseException e) {
		fail("NQuadsParser failed to parse N-Triples test document: " + e.getMessage());
	}
}
 
Example #12
Source File: StatementsDeserializer.java    From rya with Apache License 2.0
@Override
public Set<Statement> deserialize(final String topic, final byte[] data) {
    if(data == null || data.length == 0) {
        // Return null because that is the contract of this method.
        return null;
    }

    try {
        final RDFParser parser = PARSER_FACTORY.getParser();
        final Set<Statement> statements = new HashSet<>();

        parser.setRDFHandler(new AbstractRDFHandler() {
            @Override
            public void handleStatement(final Statement statement) throws RDFHandlerException {
                log.debug("Statement: " + statement);
                statements.add( statement );
            }
        });

        parser.parse(new ByteArrayInputStream(data), null);
        return statements;

    } catch(final RDFParseException | RDFHandlerException | IOException e) {
        log.error("Could not deserialize a Set of VisibilityStatement objects using the RDF4J Rio Binary format.", e);
        return null;
    }
}
 
Example #13
Source File: InferenceEngine.java    From rya with Apache License 2.0
/**
 * Query for and collect all instances of a given type. Should only be called for types expected
 * to have few members, such as ontology vocabulary terms, as instances will be collected in
 * memory.
 */
private Set<IRI> fetchInstances(final IRI type) throws QueryEvaluationException {
    final Set<IRI> instances = new HashSet<>();
    ryaDaoQueryWrapper.queryAll(null, RDF.TYPE, type, new AbstractRDFHandler() {
        @Override
        public void handleStatement(final Statement st) throws RDFHandlerException {
            if (st.getSubject() instanceof IRI) {
                instances.add((IRI) st.getSubject());
            }
        }
    });
    return instances;
}
 
Example #14
Source File: HalyardStatsTest.java    From Halyard with Apache License 2.0
@Test
public void testStatsTargetPartial() throws Exception {
    final HBaseSail sail = new HBaseSail(HBaseServerTestInstance.getInstanceConfig(), "statsTable3", true, -1, true, 0, null, null);
    sail.initialize();
    try (InputStream ref = HalyardStatsTest.class.getResourceAsStream("testData.trig")) {
        RDFParser p = Rio.createParser(RDFFormat.TRIG);
        p.setPreserveBNodeIDs(true);
        p.setRDFHandler(new AbstractRDFHandler() {
            @Override
            public void handleStatement(Statement st) throws RDFHandlerException {
                sail.addStatement(st.getSubject(), st.getPredicate(), st.getObject(), st.getContext());
            }
        }).parse(ref, "");
    }
    sail.commit();
    sail.close();

    File root = File.createTempFile("test_stats", "");
    root.delete();
    root.mkdirs();

    assertEquals(0, ToolRunner.run(HBaseServerTestInstance.getInstanceConfig(), new HalyardStats(),
            new String[]{"-s", "statsTable3", "-t", root.toURI().toURL().toString() + "stats{0}.trig", "-r", "100", "-g", "http://whatever/myStats", "-c", "http://whatever/graph0"}));

    File stats = new File(root, "stats0.trig");
    assertTrue(stats.isFile());
    try (InputStream statsStream = new FileInputStream(stats)) {
        try (InputStream refStream = HalyardStatsTest.class.getResourceAsStream("testStatsTargetPartial.trig")) {
            Model statsM = Rio.parse(statsStream, "", RDFFormat.TRIG, new ParserConfig().set(BasicParserSettings.PRESERVE_BNODE_IDS, true), SimpleValueFactory.getInstance(), new ParseErrorLogger());
            Model refM = Rio.parse(refStream, "", RDFFormat.TRIG, new ParserConfig().set(BasicParserSettings.PRESERVE_BNODE_IDS, true), SimpleValueFactory.getInstance(), new ParseErrorLogger(), SimpleValueFactory.getInstance().createIRI("http://whatever/myStats"));
            assertEqualModels(refM, statsM);
        }
    }
}
 
Example #15
Source File: HalyardStatsTest.java    From Halyard with Apache License 2.0
@Test
public void testStatsTarget() throws Exception {
    final HBaseSail sail = new HBaseSail(HBaseServerTestInstance.getInstanceConfig(), "statsTable", true, -1, true, 0, null, null);
    sail.initialize();
    try (InputStream ref = HalyardStatsTest.class.getResourceAsStream("testData.trig")) {
        RDFParser p = Rio.createParser(RDFFormat.TRIG);
        p.setPreserveBNodeIDs(true);
        p.setRDFHandler(new AbstractRDFHandler() {
            @Override
            public void handleStatement(Statement st) throws RDFHandlerException {
                sail.addStatement(st.getSubject(), st.getPredicate(), st.getObject(), st.getContext());
            }
        }).parse(ref, "");
    }
    sail.commit();
    sail.close();

    File root = File.createTempFile("test_stats", "");
    root.delete();
    root.mkdirs();

    assertEquals(0, ToolRunner.run(HBaseServerTestInstance.getInstanceConfig(), new HalyardStats(),
            new String[]{"-s", "statsTable", "-t", root.toURI().toURL().toString() + "stats{0}.trig", "-r", "100", "-g", "http://whatever/myStats"}));

    File stats = new File(root, "stats0.trig");
    assertTrue(stats.isFile());
    try (InputStream statsStream = new FileInputStream(stats)) {
        try (InputStream refStream = HalyardStatsTest.class.getResourceAsStream("testStatsTarget.trig")) {
            Model statsM = Rio.parse(statsStream, "", RDFFormat.TRIG, new ParserConfig().set(BasicParserSettings.PRESERVE_BNODE_IDS, true), SimpleValueFactory.getInstance(), new ParseErrorLogger());
            Model refM = Rio.parse(refStream, "", RDFFormat.TRIG, new ParserConfig().set(BasicParserSettings.PRESERVE_BNODE_IDS, true), SimpleValueFactory.getInstance(), new ParseErrorLogger(), SimpleValueFactory.getInstance().createIRI("http://whatever/myStats"));
            assertEqualModels(refM, statsM);
        }
    }
}
 
Example #16
Source File: InferenceEngine.java    From rya with Apache License 2.0
private void refreshOneOf() throws QueryEvaluationException {
    final Map<Resource, Set<Resource>> enumTypes = new HashMap<>();

    // First query for all the owl:oneOf's.
    // If we have the following oneOf:
    // :A owl:oneOf (:B, :C)
    // It will be represented by triples following a pattern similar to:
    // <:A> owl:oneOf _:bnode1 .
    //  _:bnode1 rdf:first <:B> .
    //  _:bnode1 rdf:rest _:bnode2 .
    // _:bnode2 rdf:first <:C> .
    // _:bnode2 rdf:rest rdf:nil .
    ryaDaoQueryWrapper.queryAll(null, OWL.ONEOF, null, new AbstractRDFHandler() {
        @Override
        public void handleStatement(final Statement statement) throws RDFHandlerException {
            final Resource enumType = statement.getSubject();
            // listHead will point to a type class of the enumeration.
            final IRI listHead = (IRI) statement.getObject();
            if (!enumTypes.containsKey(enumType)) {
                enumTypes.put(enumType, new LinkedHashSet<Resource>());
            }

            // listHead should point to a list of items that forms the
            // enumeration.
            try {
                final Set<Resource> enumeration = new LinkedHashSet<>(getList(listHead));
                if (!enumeration.isEmpty()) {
                    // Add this enumeration for this type.
                    enumTypes.get(enumType).addAll(enumeration);
                }
            } catch (final QueryEvaluationException e) {
                throw new RDFHandlerException("Error getting enumeration list.", e);
            }
        }
    });

    synchronized(enumerations) {
        enumerations.clear();
        enumerations.putAll(enumTypes);
    }
}
 
Example #17
Source File: KafkaLoadStatements.java    From rya with Apache License 2.0
@Override
public void fromFile(final Path statementsPath, final String visibilities) throws RyaStreamsException {
    requireNonNull(statementsPath);
    requireNonNull(visibilities);

    if(!statementsPath.toFile().exists()) {
        throw new RyaStreamsException("Could not load statements at path '" + statementsPath + "' because that " +
                "does not exist. Make sure you've entered the correct path.");
    }

    // Create an RDF Parser whose format is derived from the statementPath's file extension.
    final String filename = statementsPath.getFileName().toString();
    final RDFFormat format = RdfFormatUtils.forFileName(filename);
    if (format == null) {
        throw new UnsupportedRDFormatException("Unknown RDF format for the file: " + filename);
    }
    final RDFParser parser = Rio.createParser(format);

    // Set a handler that writes the statements to the specified kafka topic.
    parser.setRDFHandler(new AbstractRDFHandler() {
        @Override
        public void startRDF() throws RDFHandlerException {
            log.trace("Starting loading statements.");
        }

        @Override
        public void handleStatement(final Statement stmnt) throws RDFHandlerException {
            final VisibilityStatement visiStatement = new VisibilityStatement(stmnt, visibilities);
            producer.send(new ProducerRecord<>(topic, visiStatement));
        }

        @Override
        public void endRDF() throws RDFHandlerException {
            producer.flush();
            log.trace("Done.");
        }
    });

    // Do the parse and load.
    try {
        parser.parse(Files.newInputStream(statementsPath), "");
    } catch (RDFParseException | RDFHandlerException | IOException e) {
        throw new RyaStreamsException("Could not load the RDF file's Statements into Rya Streams.", e);
    }
}
 
Example #18
Source File: HalyardBulkUpdate.java    From Halyard with Apache License 2.0
public int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String queryFiles = cmd.getOptionValue('q');
    String workdir = cmd.getOptionValue('w');
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, Long.parseLong(cmd.getOptionValue('e', String.valueOf(System.currentTimeMillis()))));
    if (cmd.hasOption('i')) getConf().set(ELASTIC_INDEX_URL, cmd.getOptionValue('i'));
    TableMapReduceUtil.addDependencyJars(getConf(),
           HalyardExport.class,
           NTriplesUtil.class,
           Rio.class,
           AbstractRDFHandler.class,
           RDFFormat.class,
           RDFParser.class,
           HTable.class,
           HBaseConfiguration.class,
           AuthenticationProtos.class,
           Trace.class,
           Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    getConf().setStrings(TABLE_NAME_PROPERTY, source);
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, getConf().getLong(DEFAULT_TIMESTAMP_PROPERTY, System.currentTimeMillis()));
    int stages = 1;
    for (int stage = 0; stage < stages; stage++) {
        Job job = Job.getInstance(getConf(), "HalyardBulkUpdate -> " + workdir + " -> " + source + " stage #" + stage);
        job.getConfiguration().setInt(STAGE_PROPERTY, stage);
        job.setJarByClass(HalyardBulkUpdate.class);
        job.setMapperClass(SPARQLUpdateMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        job.setInputFormatClass(QueryInputFormat.class);
        job.setSpeculativeExecution(false);
        job.setReduceSpeculativeExecution(false);
        try (HTable hTable = HalyardTableUtils.getTable(getConf(), source, false, 0)) {
            HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
            QueryInputFormat.setQueriesFromDirRecursive(job.getConfiguration(), queryFiles, true, stage);
            Path outPath = new Path(workdir, "stage"+stage);
            FileOutputFormat.setOutputPath(job, outPath);
            TableMapReduceUtil.addDependencyJars(job);
            TableMapReduceUtil.initCredentials(job);
            if (stage == 0) { //count real number of stages
                for (InputSplit is : new QueryInputFormat().getSplits(job)) {
                    QueryInputFormat.QueryInputSplit qis = (QueryInputFormat.QueryInputSplit)is;
                    int updates = QueryParserUtil.parseUpdate(QueryLanguage.SPARQL, qis.getQuery(), null).getUpdateExprs().size();
                    if (updates > stages) {
                        stages = updates;
                    }
                    LOG.log(Level.INFO, "{0} contains {1} stages of the update sequence.", new Object[]{qis.getQueryName(), updates});
                }
                LOG.log(Level.INFO, "Bulk Update will process {0} MapReduce stages.", stages);
            }
            if (job.waitForCompletion(true)) {
                new LoadIncrementalHFiles(getConf()).doBulkLoad(outPath, hTable);
                LOG.log(Level.INFO, "Stage #{0} of {1} completed..", new Object[]{stage, stages});
            } else {
                return -1;
            }
        }
    }
    LOG.info("Bulk Update Completed..");
    return 0;
}
 
Example #19
Source File: HalyardBulkLoad.java    From Halyard with Apache License 2.0
@Override
protected int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String workdir = cmd.getOptionValue('w');
    String target = cmd.getOptionValue('t');
    getConf().setBoolean(SKIP_INVALID_PROPERTY, cmd.hasOption('i'));
    getConf().setBoolean(VERIFY_DATATYPE_VALUES_PROPERTY, cmd.hasOption('d'));
    getConf().setBoolean(TRUNCATE_PROPERTY, cmd.hasOption('r'));
    getConf().setInt(SPLIT_BITS_PROPERTY, Integer.parseInt(cmd.getOptionValue('b', "3")));
    if (cmd.hasOption('g')) getConf().set(DEFAULT_CONTEXT_PROPERTY, cmd.getOptionValue('g'));
    getConf().setBoolean(OVERRIDE_CONTEXT_PROPERTY, cmd.hasOption('o'));
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, Long.parseLong(cmd.getOptionValue('e', String.valueOf(System.currentTimeMillis()))));
    if (cmd.hasOption('m')) getConf().setLong("mapreduce.input.fileinputformat.split.maxsize", Long.parseLong(cmd.getOptionValue('m')));
    TableMapReduceUtil.addDependencyJars(getConf(),
            NTriplesUtil.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + workdir + " -> " + target);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), target, true, getConf().getInt(SPLIT_BITS_PROPERTY, 3))) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, source);
        FileOutputFormat.setOutputPath(job, new Path(workdir));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            if (getConf().getBoolean(TRUNCATE_PROPERTY, false)) {
                HalyardTableUtils.truncateTable(hTable).close();
            }
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(workdir), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}
 
Example #20
Source File: HalyardPreSplit.java    From Halyard with Apache License 2.0
@Override
protected int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String target = cmd.getOptionValue('t');
    try (Connection con = ConnectionFactory.createConnection(getConf())) {
        try (Admin admin = con.getAdmin()) {
            if (admin.tableExists(TableName.valueOf(target))) {
                LOG.log(Level.WARNING, "Pre-split cannot modify already existing table {0}", target);
                return -1;
            }
        }
    }
    getConf().setBoolean(SKIP_INVALID_PROPERTY, cmd.hasOption('i'));
    if (cmd.hasOption('g')) getConf().set(DEFAULT_CONTEXT_PROPERTY, cmd.getOptionValue('g'));
    getConf().setBoolean(OVERRIDE_CONTEXT_PROPERTY, cmd.hasOption('o'));
    TableMapReduceUtil.addDependencyJars(getConf(),
            NTriplesUtil.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, getConf().getLong(DEFAULT_TIMESTAMP_PROPERTY, System.currentTimeMillis()));
    getConf().setInt(DECIMATION_FACTOR_PROPERTY, Integer.parseInt(cmd.getOptionValue('d', String.valueOf(DEFAULT_DECIMATION_FACTOR))));
    getConf().setLong(SPLIT_LIMIT_PROPERTY, Long.parseLong(cmd.getOptionValue('l', String.valueOf(DEFAULT_SPLIT_LIMIT))));
    Job job = Job.getInstance(getConf(), "HalyardPreSplit -> " + target);
    job.getConfiguration().set(TABLE_PROPERTY, target);
    job.setJarByClass(HalyardPreSplit.class);
    job.setMapperClass(RDFDecimatingMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, source);
    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.initCredentials(job);
    job.setReducerClass(PreSplitReducer.class);
    job.setNumReduceTasks(1);
    job.setOutputFormatClass(NullOutputFormat.class);
    if (job.waitForCompletion(true)) {
        LOG.info("PreSplit Calculation Completed..");
        return 0;
    }
    return -1;
}
 
Example #21
Source File: HalyardSummary.java    From Halyard with Apache License 2.0
@Override
public int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String target = cmd.getOptionValue('t');
    TableMapReduceUtil.addDependencyJars(getConf(),
           HalyardExport.class,
           Rio.class,
           AbstractRDFHandler.class,
           RDFFormat.class,
           RDFParser.class,
           HTable.class,
           HBaseConfiguration.class,
           AuthenticationProtos.class,
           Trace.class,
           Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardSummary " + source + (target == null ? " update" : " -> " + target));
    job.getConfiguration().set(SOURCE, source);
    if (target != null) job.getConfiguration().set(TARGET, target);
    if (cmd.hasOption('g')) job.getConfiguration().set(TARGET_GRAPH, cmd.getOptionValue('g'));
    if (cmd.hasOption('d')) job.getConfiguration().setInt(DECIMATION_FACTOR, Integer.parseInt(cmd.getOptionValue('d')));
    job.setJarByClass(HalyardSummary.class);
    TableMapReduceUtil.initCredentials(job);

    Scan scan = HalyardTableUtils.scan(new byte[]{HalyardTableUtils.POS_PREFIX}, new byte[]{HalyardTableUtils.POS_PREFIX + 1});

    TableMapReduceUtil.initTableMapperJob(source,
            scan,
            SummaryMapper.class,
            ImmutableBytesWritable.class,
            LongWritable.class,
            job);
    job.setNumReduceTasks(1);
    job.setCombinerClass(SummaryCombiner.class);
    job.setReducerClass(SummaryReducer.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    if (job.waitForCompletion(true)) {
        LOG.info("Summary Generation Completed..");
        return 0;
    }
    return -1;
}
 
Example #22
Source File: HalyardBulkDelete.java    From Halyard with Apache License 2.0
@Override
public int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('t');
    TableMapReduceUtil.addDependencyJars(getConf(),
        HalyardExport.class,
        NTriplesUtil.class,
        Rio.class,
        AbstractRDFHandler.class,
        RDFFormat.class,
        RDFParser.class,
        HTable.class,
        HBaseConfiguration.class,
        AuthenticationProtos.class,
        Trace.class,
        Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardDelete " + source);
    if (cmd.hasOption('s')) {
        job.getConfiguration().set(SUBJECT, cmd.getOptionValue('s'));
    }
    if (cmd.hasOption('p')) {
        job.getConfiguration().set(PREDICATE, cmd.getOptionValue('p'));
    }
    if (cmd.hasOption('o')) {
        job.getConfiguration().set(OBJECT, cmd.getOptionValue('o'));
    }
    if (cmd.hasOption('g')) {
        job.getConfiguration().setStrings(CONTEXTS, cmd.getOptionValues('g'));
    }
    job.setJarByClass(HalyardBulkDelete.class);
    TableMapReduceUtil.initCredentials(job);

    Scan scan = HalyardTableUtils.scan(null, null);

    TableMapReduceUtil.initTableMapperJob(source,
        scan,
        DeleteMapper.class,
        ImmutableBytesWritable.class,
        LongWritable.class,
        job);

    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setSpeculativeExecution(false);
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), source, false, 0)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileOutputFormat.setOutputPath(job, new Path(cmd.getOptionValue('f')));
        TableMapReduceUtil.addDependencyJars(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(cmd.getOptionValue('f')), hTable);
            LOG.info("Bulk Delete Completed..");
            return 0;
        }
    }
    return -1;
}
 
Example #23
Source File: HalyardBulkExport.java    From Halyard with Apache License 2.0
@Override
protected int run(CommandLine cmd) throws Exception {
    if (!cmd.getArgList().isEmpty()) throw new HalyardExport.ExportException("Unknown arguments: " + cmd.getArgList().toString());
    String source = cmd.getOptionValue('s');
    String queryFiles = cmd.getOptionValue('q');
    String target = cmd.getOptionValue('t');
    if (!target.contains("{0}")) {
        throw new HalyardExport.ExportException("Bulk export target must contain '{0}' to be replaced by stripped filename of the actual SPARQL query.");
    }
    getConf().set(SOURCE, source);
    getConf().set(TARGET, target);
    String driver = cmd.getOptionValue('c');
    if (driver != null) {
        getConf().set(JDBC_DRIVER, driver);
    }
    String props[] = cmd.getOptionValues('p');
    if (props != null) {
        for (int i=0; i<props.length; i++) {
            props[i] = Base64.encodeBase64String(props[i].getBytes(StandardCharsets.UTF_8));
        }
        getConf().setStrings(JDBC_PROPERTIES, props);
    }
    if (cmd.hasOption('i')) getConf().set(HalyardBulkUpdate.ELASTIC_INDEX_URL, cmd.getOptionValue('i'));
    TableMapReduceUtil.addDependencyJars(getConf(),
           HalyardExport.class,
           NTriplesUtil.class,
           Rio.class,
           AbstractRDFHandler.class,
           RDFFormat.class,
           RDFParser.class,
           HTable.class,
           HBaseConfiguration.class,
           AuthenticationProtos.class,
           Trace.class,
           Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    String cp = cmd.getOptionValue('l');
    if (cp != null) {
        String jars[] = cp.split(":");
        StringBuilder newCp = new StringBuilder();
        for (int i=0; i<jars.length; i++) {
            if (i > 0) newCp.append(':');
            newCp.append(addTmpFile(jars[i])); // append classpath entries to tmpfiles and trim paths from the classpath
        }
        getConf().set(JDBC_CLASSPATH, newCp.toString());
    }
    Job job = Job.getInstance(getConf(), "HalyardBulkExport " + source + " -> " + target);
    job.setJarByClass(HalyardBulkExport.class);
    job.setMaxMapAttempts(1);
    job.setMapperClass(BulkExportMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Void.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(QueryInputFormat.class);
    QueryInputFormat.setQueriesFromDirRecursive(job.getConfiguration(), queryFiles, false, 0);
    job.setOutputFormatClass(NullOutputFormat.class);
    TableMapReduceUtil.initCredentials(job);
    if (job.waitForCompletion(true)) {
        LOG.info("Bulk Export Completed..");
        return 0;
    }
    return -1;
}
 
Example #24
Source File: HalyardStats.java    From Halyard with Apache License 2.0
@Override
public int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String target = cmd.getOptionValue('t');
    String targetGraph = cmd.getOptionValue('g');
    String graphContext = cmd.getOptionValue('c');
    String thresh = cmd.getOptionValue('r');
    TableMapReduceUtil.addDependencyJars(getConf(),
        HalyardExport.class,
        NTriplesUtil.class,
        Rio.class,
        AbstractRDFHandler.class,
        RDFFormat.class,
        RDFParser.class,
        HTable.class,
        HBaseConfiguration.class,
        AuthenticationProtos.class,
        Trace.class,
        Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardStats " + source + (target == null ? " update" : " -> " + target));
    job.getConfiguration().set(SOURCE, source);
    if (target != null) job.getConfiguration().set(TARGET, target);
    if (targetGraph != null) job.getConfiguration().set(TARGET_GRAPH, targetGraph);
    if (graphContext != null) job.getConfiguration().set(GRAPH_CONTEXT, graphContext);
    if (thresh != null) job.getConfiguration().setLong(THRESHOLD, Long.parseLong(thresh));
    job.setJarByClass(HalyardStats.class);
    TableMapReduceUtil.initCredentials(job);

    Scan scan = HalyardTableUtils.scan(null, null);
    if (graphContext != null) { //restricting stats to scan given graph context only
        List<RowRange> ranges = new ArrayList<>();
        byte[] gcHash = HalyardTableUtils.hashKey(SimpleValueFactory.getInstance().createIRI(graphContext));
        ranges.add(rowRange(HalyardTableUtils.CSPO_PREFIX, gcHash));
        ranges.add(rowRange(HalyardTableUtils.CPOS_PREFIX, gcHash));
        ranges.add(rowRange(HalyardTableUtils.COSP_PREFIX, gcHash));
        if (target == null) { //add stats context to the scanned row ranges (when in update mode) to delete the related stats during MapReduce
            ranges.add(rowRange(HalyardTableUtils.CSPO_PREFIX, HalyardTableUtils.hashKey(targetGraph == null ? HALYARD.STATS_GRAPH_CONTEXT : SimpleValueFactory.getInstance().createIRI(targetGraph))));
        }
        scan.setFilter(new MultiRowRangeFilter(ranges));
    }
    TableMapReduceUtil.initTableMapperJob(
        source,
        scan,
        StatsMapper.class,
        ImmutableBytesWritable.class,
        LongWritable.class,
        job);
    job.setPartitionerClass(StatsPartitioner.class);
    job.setReducerClass(StatsReducer.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    if (job.waitForCompletion(true)) {
        LOG.info("Stats Generation Completed..");
        return 0;
    }
    return -1;
}