package eu.linkedeodata.geotriples;

import be.ugent.mmlab.rml.core.RMLMappingFactory;
import be.ugent.mmlab.rml.function.Function;
import be.ugent.mmlab.rml.function.FunctionFactory;
import be.ugent.mmlab.rml.model.RMLMapping;
import be.ugent.mmlab.rml.model.TriplesMap;
import eu.linkedeodata.geotriples.Converters.RML_Converter;
import eu.linkedeodata.geotriples.utils.SparkReader;
import javafx.util.Pair;
import jena.cmdline.ArgDecl;
import jena.cmdline.CommandLine;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.serializer.KryoSerializer;
import org.apache.spark.sql.*;
import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator;
import org.openrdf.model.URI;

import java.io.IOException;
import java.util.*;

import scala.Tuple2;
import scala.reflect.ClassTag;

/**
 * Generate Mapping file:
 *      java -cp geotriples-core/target/geotriples-dependencies.jar eu.linkedeodata.geotriples.GeoTriplesCMD generate_mapping -rml  -o <out> -b <in>
 * Dump RDF:
 *      java -cp geotriples-core/target/geotriples-dependencies.jar eu.linkedeodata.geotriples.GeoTriplesCMD dump_rdf -o <out> -b -sh <shp> <rml>
 * Execute Spark implementation:
 *      spark-submit  --master local[*]   --class eu.linkedeodata.geotriples.GeoTriplesCMD geotriples-core/target/geotriples-dependencies.jar  spark  -i <in_file> -o <out_folder> <rml>
 * Execute in Hadoop cluster using YARN:
 *      spark-submit  --master yarn --deploy-mode cluster --class eu.linkedeodata.geotriples.GeoTriplesCMD geotriples-core/target/geotriples-dependencies.jar  spark  -i <hdfs_path> -o <hdfs_path> <hdfs_path>
 * Debug:
 *      spark-submit  --master local[*] --conf spark.driver.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005  --class eu.linkedeodata.geotriples.GeoTriplesCMD geotriples-core/target/geotriples-dependencies.jar  spark  -i <in_file> -o <out_folder> <rml>
 * Execute using VisualVM profiler:
 *      spark-submit   --executor-memory 10g --driver-memory 10g --conf " -Djava.rmi.server.hostname=localhost"  --master local[*] --class eu.linkedeodata.geotriples.GeoTriplesCMD geotriples-core/target/geotriples-dependencies.jar  spark -i <in_file> -o <out_folder> <rml>
 * Execute in HopsWork (entry class eu.linkedeodata.geotriples.GeoTriplesCMD):
 *      hops -i hdfs:///Projects/testHops/Experiments/Cyprus_pois.csv -o test hdfs:///Projects/testHops/Experiments/cyprus_csv.ttl
 *
 * Performs the conversion of big geo-spatial data into RDF triples using Apache Spark.
 * It is designed to be executed in HopsWork.
 */
public class SparkMaster  {

    private enum Mode {

    private SparkSession spark;          // active Spark session, built in the constructor
    private SparkReader reader;          // reads the input dataset into Spark

    private String inputFile;            // path to the input file (or shapefile folder when -sh is used)
    private String outputDir;            // directory where the produced triples are written
    private String mappingFile;          // path to the RML mapping (.ttl) file — always the last CLI argument
    private JavaRDD<Row> rowRDD;         // the input dataset as an RDD of Rows
    private FileSystem fs;               // Hadoop filesystem handle (HDFS or local)
    private Mode mode;                   // conversion granularity (per row / per partition)
    private String repartition = null;   // requested number of partitions; null = keep Spark's default
    private Logger log;                  // application logger ("GEOTRIPLES-SPARK")

    /**
     * Constructor. Parses the input arguments and configures Spark.
     * @param inputArgs input command line arguments.
     */
    SparkMaster(String[] inputArgs){
        // NOTE(review): this constructor appears truncated in this copy of the
        // file — several closing braces, `log.info(` call prefixes and whole
        // statements are missing below. Inline NOTE(review) comments mark the
        // visibly incomplete spots; recover the full body from version control
        // before relying on this source.

        // set loggers level
        log = Logger.getLogger("GEOTRIPLES-SPARK");

        boolean is_shp_folder = false;
        String outputDirArg;
        try {
            // parse the input arguments
            CommandLine cmd = new CommandLine();
            ArgDecl outDirArg = new ArgDecl(true, "-o", "out", "outfile");
            ArgDecl infileArg = new ArgDecl(true, "-i", "in", "infile");
            ArgDecl inSHFArg = new ArgDecl(true, "-sh", "sh", "shapefileForlder");
            ArgDecl repartitionArg = new ArgDecl(true, "-r", "re", "repartition");
            ArgDecl helpArg = new ArgDecl(false, "-h", "help");
            ArgDecl modeArg = new ArgDecl(true, "-m", "mode");
            ArgDecl timesArg = new ArgDecl(true, "-times", "times");


            // `usage` is flipped to true on any argument error; all errors are
            // collected so they can be reported together.
            boolean usage = false;
            List<String> errors = new ArrayList<>();

            // NOTE(review): `cmd` is queried below but never fed the arguments —
            // a `cmd.process(inputArgs);` call seems to be missing here.
            if (cmd.hasArg(helpArg)) usage(null);

            if (cmd.hasArg(outDirArg)) {
                outputDirArg = cmd.getArg(outDirArg).getValue();
                outputDir = outputDirArg;
                // strip a single trailing slash from the output path
                if(outputDir.contains("/") && outputDir.lastIndexOf("/") == outputDir.length() -1)
                    outputDir = outputDir.substring(0, outputDir.length() - 1);

                Path outputDirArg_path = new Path(outputDir);
                Configuration conf = new Configuration();
                fs = FileSystem.get(conf);

                // Create the output folder
                // if the specified directory does not exist do nothing
                // if the specified directory does exist, create a new one inside the specified one
                if (fs.exists(outputDirArg_path)) {
                    if (!fs.isDirectory(outputDirArg_path)) {
                        usage = true;
                        errors.add("ERROR: \"-o\" flag must point to a directory.");
                    } else {
                        outputDir = outputDir + "/GeoTriples_results";
                        outputDirArg_path = new Path(outputDir);
                // NOTE(review): braces closing the `else` above are missing and
                // the indentation jumps — truncation suspected from here on.
                try {
                    int i = 1;
                    String temp_name = outputDir;
                    // probe for a non-existing "<name>_<i>" directory
                    // NOTE(review): no `i++` is visible in this loop — as
                    // written it could never terminate; the increment was
                    // probably lost in truncation.
                    while (fs.exists(outputDirArg_path)) {
                        outputDir = temp_name + "_" + i;
                        outputDirArg_path = new Path(outputDir);
                    if (!outputDirArg.equals(outputDir))
                        log.warn("Because the " + outputDirArg + " already exists, the results will be located in " + outputDir);
                catch (Exception e){
      // NOTE(review): orphaned argument below — its `log.info(` prefix was stripped.
      "The results will be located in " + outputDir);
            else {
                usage = true;
                errors.add("ERROR: You need to specify the output directory using the \"-o\" flag.");

            if (cmd.hasArg(infileArg)){
                inputFile = cmd.getArg(infileArg).getValue();
                if (cmd.hasArg(timesArg)){
                    // it is used in order to load a single file multiple times
                    StringBuilder sb = new StringBuilder();
                    int times_to_load = Integer.parseInt(cmd.getArg(timesArg).getValue());
                    // NOTE(review): the loop body (presumably appending the
                    // path to `sb`) is missing.
                    for (int i = 0; i < times_to_load - 1; i++)
                    inputFile = sb.toString();
            else if (cmd.hasArg(inSHFArg)){
      // NOTE(review): orphaned argument below — its `log.info(` prefix was stripped.
      "Shapefile detected.");
                inputFile = cmd.getArg(inSHFArg).getValue();
                Path shp_folder = new Path(inputFile);
                Configuration conf = new Configuration();
                fs = FileSystem.get(conf);

                if (!fs.isDirectory(shp_folder)) {
                    usage = true;
                    errors.add("ERROR: \"-sh\" flag must point to a directory.");
                is_shp_folder = true;
            else {
                usage = true;
                errors.add("ERROR: You need to specify the input file using the \"-i\" flag.");
            if (cmd.hasArg(modeArg)){
                String in_mode = cmd.getArg(modeArg).getValue();
                if (in_mode.equals("row"))
                    mode = Mode.ROW;
                else if (in_mode.equals("partition"))
                    mode = Mode.PARTITION;
                else {
                    // unrecognised mode value: fall back to per-row conversion
                    log.warn("No mode \"" + in_mode + "\". The conversion mode is set to  \"per ROW conversion\".");
                    mode = Mode.ROW;
                // NOTE(review): duplicated default assignment and an orphaned,
                // syntactically broken log string (`+ +`) below — the `else`
                // branch for "no -m flag" appears mangled by truncation.
                mode = Mode.ROW;
  "The conversion mode is set to \"per " + + " conversion\".");

            if (cmd.hasArg(repartitionArg))
                repartition = cmd.getArg(repartitionArg).getValue();

            // the mapping file is always the last argument and must be a .ttl
            if (inputArgs[inputArgs.length - 1].endsWith(".ttl"))
                mappingFile = inputArgs[inputArgs.length - 1];
            else {
                usage = true;
                errors.add("ERROR: The last argument must be the mapping file and it must end with the extension .ttl");
            // NOTE(review): the statement of this `if` (presumably
            // `usage(errors);`) is missing.
            if (usage)
        // NOTE(review): the catch body is missing — as shown, parse/IO errors
        // would be silently swallowed.
        catch (IllegalArgumentException | IOException e){

        // configure spark
        // NOTE(review): "spark.serializer" is set twice with the same value —
        // the second .set is redundant.
        SparkConf conf = new SparkConf()
                .set("spark.serializer", KryoSerializer.class.getName())
                .set("spark.kryo.registrator", GeoSparkKryoRegistrator.class.getName())
                .set("spark.hadoop.validateOutputSpecs", "false")
                .set("spark.serializer", KryoSerializer.class.getName());

        // Shapefiles require more SparkMemory for shuffling
        // NOTE(review): no braces — only the first .set is conditional; the
        // second always executes and overwrites the fraction with "0.2".
        // An if/else with braces was probably intended.
        if (repartition != null)
            conf.set("spark.memory.fraction", "0.5");
            conf.set("spark.memory.fraction", "0.2");

        // NOTE(review): the SparkSession builder chain (.builder().config(conf)
        // .getOrCreate() or similar) is truncated, and the property key in
        // System.setProperty is empty — likely "file.encoding" originally.
        spark = SparkSession
        System.setProperty("", "utf8");

        reader = new SparkReader(inputFile, is_shp_folder, spark);

    /** Read input according to its file-type and store it as a Spark Dataset. */
    public void readInput() { rowRDD =; } // NOTE(review): assignment RHS is missing (likely a `reader.read(...)` call) — source appears truncated; confirm against version control.

    /**
     * Convert the produced Dataset into RDF triples and store the results in a directory.
     * The conversion can be either per row or per partition, as chosen by the user.
     * If the user did not define the conversion mode, it defaults to per-row conversion.
     */
    public void convert2RDF() {
        // NOTE(review): this method appears truncated — the `log.info(`
        // prefixes of several calls are missing (leaving orphaned string
        // literals), the switch cases lack their bodies (presumably calls to
        // convert_row / convert_partition plus `break`), and the method's
        // closing brace is absent. Recover from version control.
        // data that will be passed to the conversion
        List<String> headers = Arrays.asList(reader.getHeaders()); // header columns of the input dataset (unused in the visible code)
        // mapping parsed once on the driver to avoid per-executor I/O
        ArrayList<TriplesMap> mapping_list = RML_Parser(mappingFile);
        long startTime = System.currentTimeMillis();"Starts the conversion");
        switch (mode) {
            case ROW:
      "Conversion mode: Per Row Conversion");
            case PARTITION:
      "Conversion mode: Per Partition Conversion");
        }"The conversion completed and took " + (System.currentTimeMillis() - startTime) + " msec.\n");

    /**
     * Convert the input Dataset into RDF triples and store the results.
     * The conversion takes place per partition using the mapPartitions Spark transformation.
     * @param mapping_list list of TriplesMaps.
     */
    private void convert_partition(ArrayList<TriplesMap> mapping_list){
        // NOTE(review): truncated — the transformation that consumes the
        // lambda below (presumably `rowRDD.mapPartitions(...)`), the terminal
        // save action, and the method's closing braces are missing.
        SparkContext sc = SparkContext.getOrCreate();

        // Broadcast the TriplesMaps together with the dataset headers once, so
        // each executor can build its own converter without re-reading the
        // mapping file (disk/network I/O).
        Pair<ArrayList<TriplesMap>, List<String>> transformation_info = new Pair<>(mapping_list, Arrays.asList(reader.getHeaders()));
        ClassTag<Pair<ArrayList<TriplesMap>, List<String>>> classTag_pair = scala.reflect.ClassTag$.MODULE$.apply(Pair.class);
        Broadcast<Pair<ArrayList<TriplesMap>, List<String>>> bd_info = sc.broadcast(transformation_info, classTag_pair);

            (Iterator<Row> rows_iter) -> {
                // one converter per partition amortizes construction cost
                // across all rows of the partition
                ArrayList<TriplesMap> p_mapping_list = bd_info.value().getKey();
                List<String> p_header = bd_info.value().getValue();
                RML_Converter rml_converter = new RML_Converter(p_mapping_list, p_header);
                Iterator<String> triples = rml_converter.convertPartition(rows_iter);

                return triples;

    /**
     * Convert the input Dataset into RDF triples and store the results.
     * The conversion takes place per row using the map Spark transformation.
     * @param mapping_list list of TriplesMaps.
     */
    private void convert_row(ArrayList<TriplesMap> mapping_list){
        // NOTE(review): truncated — the RDD the `.map` below is chained onto
        // (presumably `rowRDD`), the terminal save action, and the method's
        // closing brace are missing.

        SparkContext sc = SparkContext.getOrCreate();

        // Broadcast one pre-built converter so it is shipped once per executor
        // instead of being serialized with every task closure.
        RML_Converter rml_converter = new RML_Converter(mapping_list, Arrays.asList(reader.getHeaders()));
        ClassTag<RML_Converter> classTagRML_Converter = scala.reflect.ClassTag$.MODULE$.apply(RML_Converter.class);
        Broadcast<RML_Converter> bc_converter = sc.broadcast(rml_converter, classTagRML_Converter);

        // FunctionFactory.availableFunctions is static state; it is broadcast
        // and re-installed inside the lambda on each executor — apparently
        // because static fields are not carried along with the closure.
        ClassTag<HashMap<URI, Function>> classTag_hashMap = scala.reflect.ClassTag$.MODULE$.apply(HashMap.class);
        Broadcast<HashMap<URI, Function>> bc_functionsHashMap = sc.broadcast(FunctionFactory.availableFunctions, classTag_hashMap);
            .map((row) ->  {
                FunctionFactory.availableFunctions = bc_functionsHashMap.value();
                return bc_converter.value().convertRow(row);
            } )

    /** Close the Spark session. */
    /** Shut down the underlying Spark session, releasing its resources. */
    public void endSpark() {
        spark.close();
    }

    /**
     * Parse the RML file and produce the Triple Maps.
     * It used to be inside the converter, but it is better here because
     * we want to avoid every Executor reading the mapping file.
     * Reading invokes disk I/O and network I/O and is therefore expensive!
     * @param mappingFile the input mapping file.
     */
    private ArrayList<TriplesMap> RML_Parser(String mappingFile)  {
        try {

            Path inFile = new Path(mappingFile);
            // NOTE(review): the right-hand side of this assignment is missing
            // (likely `fs.open(inFile)`) — source appears truncated; confirm
            // against version control.
            FSDataInputStream in =;
            RMLMapping mapping = RMLMappingFactory.extractRMLMapping(in);
            return new ArrayList<>(mapping.getTriplesMaps());
        } catch (Exception e) {
            // NOTE(review): the exception cause `e` is dropped (only a generic
            // message is logged) and null is returned — callers must
            // null-check. The closing braces of the catch block and of the
            // method are also missing here (truncation).
            log.error("ERROR Initializing RML_Converter");
        return null;

    /**
     * Print usage information and any accumulated argument-parsing errors.
     * @param errors list of errors (may be null).
     */
    private void usage(List<String> errors){
        if (errors != null ){
            for (String error : errors) log.error(error);
        }".___________________________________________________________.");"|\\._______________________________________________________./|");"|\\|                       GeoTriples                      |/|");"|\\|  a tool for transforming EO/geospatial data into RDF  |/|");" \\._______________________________________________________./ ");"\n");"Usage for Spark mode:  [arguments] <source mapping>");"\tArguments:");"\t\t-o <outDir>\t\tOutput directory name");"\t\t-i <inFile>\t\tPath to the input file");"\t\t-m mode\t\tDefine the conversion mode. Accepted values: \"partition\", \"row\"(default)");"\t\t-r <partitions>\t\t(Optional) Specify the number of the requested partitions. If it is set to \"default\", then the number of partitions will be calculated based on the size of the input.");"\t\t-sh <directory>\t\t\t Path that points to a directory containing multiple folders of shapefiles. Used to load multiple shapefiles");"\t\t-times <n>\t\t\t Load the input dataset <n> times");"\t\t-h \t\t\tPrint usage");"\n");"\n");"\tThe last argument must be the path to the mapping file");