/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.vxquery.hdfs2;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.mapred.SplitLocationInfo;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hyracks.api.client.NodeControllerInfo;
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.hdfs.ContextFactory;
import org.apache.hyracks.hdfs2.dataflow.FileSplitsFactory;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

public class HDFSFunctions {
    private Configuration conf;
    private FileSystem fs;
    private String confPath;
    private Job job;
    private InputFormat inputFormat;
    private List<InputSplit> splits;
    private ArrayList<ArrayList<String>> nodes;
    private HashMap<Integer, String> schedule;
    private static final String TEMP = "java.io.tmpdir";
    private static final String DFS_PATH = "vxquery_splits_schedule.txt";
    // Join with File.separator: java.io.tmpdir does not always end with one.
    private static final String FILEPATH = System.getProperty(TEMP) + File.separator + "splits_schedule.txt";
    protected static final Logger LOGGER = Logger.getLogger(HDFSFunctions.class.getName());
    private final Map<String, NodeControllerInfo> nodeControllerInfos;

    /**
     * Create the configuration and remember the paths for core-site and hdfs-site,
     * which are added as resources when {@link #getFileSystem()} creates the
     * HDFS FileSystem instance for this configuration.
     *
     * @param nodeControllerInfos
     *            Map of each node to its attributes
     * @param hdfsConf
     *            Path to the HDFS configuration directory
     */
    public HDFSFunctions(Map<String, NodeControllerInfo> nodeControllerInfos, String hdfsConf) {
        this.conf = new Configuration();
        this.nodeControllerInfos = nodeControllerInfos;
        this.confPath = hdfsConf;
    }
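    /*
     * Usage sketch (hypothetical caller, not part of this class; paths and tag
     * names are illustrative). getFileSystem() must run first, since the other
     * helpers rely on the FileSystem instance it creates:
     *
     *   HDFSFunctions hdfs = new HDFSFunctions(nodeControllerInfos, "/etc/hadoop/conf");
     *   FileSystem fs = hdfs.getFileSystem();
     *   if (fs != null) {
     *       hdfs.setJob("/data/books.xml", "book");
     *       hdfs.scheduleSplits();
     *   }
     */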
    /**
     * Create the objects needed for reading the splits of the file path given as
     * an argument. This method must run before {@link #scheduleSplits()}.
     *
     * @param filepath
     *            HDFS path of the input file or collection to read
     * @param tag
     *            XML tag that delimits each record
     */
    @SuppressWarnings({ "deprecation", "unchecked" })
    public void setJob(String filepath, String tag) {
        try {
            conf.set("start_tag", "<" + tag + ">");
            conf.set("end_tag", "</" + tag + ">");
            job = new Job(conf, "Read from HDFS");
            Path input = new Path(filepath);
            FileInputFormat.addInputPath(job, input);
            job.setInputFormatClass(XmlCollectionWithTagInputFormat.class);
            inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
            splits = inputFormat.getSplits(job);
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            if (LOGGER.isLoggable(Level.SEVERE)) {
                LOGGER.severe(e.getMessage());
            }
        }
    }

    /**
     * Returns true if the file path exists in HDFS or is located somewhere under the
     * home directory of the user that called the function, searching subdirectories too.
     *
     * @param filename
     *            HDFS file path
     * @return true if the file is located in HDFS
     * @throws IOException
     *             if searching for the file path throws an {@link IOException}
     */
    public boolean isLocatedInHDFS(String filename) throws IllegalArgumentException, IOException {
        // Check the file path directly first.
        if (fs.exists(new Path(filename))) {
            return true;
        }
        return searchInDirectory(fs.getHomeDirectory(), filename) != null;
    }

    /**
     * Searches the given directory recursively for the file.
     *
     * @param directory
     *            directory to search
     * @param filename
     *            name of the file we want
     * @return the path if the file exists under this directory, else null
     */
    public Path searchInDirectory(Path directory, String filename) {
        // Compare the last path component of every file under this path with the filename.
        try {
            RemoteIterator<LocatedFileStatus> it = fs.listFiles(directory, true);
            String[] parts;
            Path path;
            while (it.hasNext()) {
                path = it.next().getPath();
                parts = path.toString().split("/");
                if (parts[parts.length - 1].equals(filename)) {
                    return path;
                }
            }
        } catch (IOException e) {
            if (LOGGER.isLoggable(Level.SEVERE)) {
                LOGGER.severe(e.getMessage());
            }
        }
        return null;
    }

    /**
     * If no configuration path was given, fall back to the HADOOP_CONF_DIR system
     * environment variable as the directory holding the HDFS configuration.
     *
     * @return true if a configuration directory was found
     */
    private boolean locateConf() {
        if (this.confPath == null) {
            // As a last resort, try the system environment. Some systems won't have this set.
            this.confPath = System.getenv("HADOOP_CONF_DIR");
        }
        return this.confPath != null;
    }

    /**
     * Upload a file or directory to HDFS. filepath is the path in the local file
     * system; dir is the destination path. Any existing file at the destination
     * is replaced.
     *
     * @param filepath
     *            file to upload
     * @param dir
     *            HDFS directory to save the file in
     * @return true if the copy succeeded
     */
    public boolean put(String filepath, String dir) {
        if (this.fs != null) {
            Path path = new Path(filepath);
            Path dest = new Path(dir);
            try {
                if (fs.exists(dest)) {
                    fs.delete(dest, true); // recursive delete
                }
            } catch (IOException e) {
                if (LOGGER.isLoggable(Level.SEVERE)) {
                    LOGGER.severe(e.getMessage());
                }
            }
            try {
                fs.copyFromLocalFile(path, dest);
                // Report success; previously this method always returned false.
                return true;
            } catch (IOException e) {
                if (LOGGER.isLoggable(Level.SEVERE)) {
                    LOGGER.severe(e.getMessage());
                }
            }
        }
        return false;
    }
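    /*
     * Example (illustrative paths): upload a local schedule file, replacing any
     * existing copy, then verify that it landed in HDFS.
     *
     *   boolean ok = hdfs.put("/tmp/splits_schedule.txt", "vxquery_splits_schedule.txt");
     *   if (ok && hdfs.isLocatedInHDFS("vxquery_splits_schedule.txt")) {
     *       // the schedule is now available cluster-wide
     *   }
     */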
    /**
     * Get an instance of the HDFS file system if it is configured correctly;
     * return null if there is no instance.
     *
     * @return FileSystem
     */
    public FileSystem getFileSystem() {
        if (locateConf()) {
            conf.addResource(new Path(this.confPath + "/core-site.xml"));
            conf.addResource(new Path(this.confPath + "/hdfs-site.xml"));
            try {
                fs = FileSystem.get(conf);
                return this.fs;
            } catch (IOException e) {
                if (LOGGER.isLoggable(Level.SEVERE)) {
                    LOGGER.severe(e.getMessage());
                }
            }
        } else {
            if (LOGGER.isLoggable(Level.SEVERE)) {
                LOGGER.severe("Could not locate HDFS configuration folder.");
            }
        }
        return null;
    }

    /**
     * Map each host name to the indexes of the splits it stores, using the first
     * reported location of every split.
     */
    public HashMap<String, ArrayList<Integer>> getLocationsOfSplits() throws IOException {
        HashMap<String, ArrayList<Integer>> splitsMap = new HashMap<>();
        ArrayList<Integer> temp;
        int i = 0;
        String hostname;
        for (InputSplit s : this.splits) {
            SplitLocationInfo[] info = s.getLocationInfo();
            hostname = info[0].getLocation();
            if (splitsMap.containsKey(hostname)) {
                temp = splitsMap.get(hostname);
                temp.add(i);
            } else {
                temp = new ArrayList<>();
                temp.add(i);
                splitsMap.put(hostname, temp);
            }
            i++;
        }
        return splitsMap;
    }

    public void scheduleSplits() throws IOException, ParserConfigurationException, SAXException {
        schedule = new HashMap<>();
        ArrayList<String> empty = new ArrayList<>();
        HashMap<String, ArrayList<Integer>> splitsMap = this.getLocationsOfSplits();
        readNodesFromXML();
        int count = this.splits.size();
        String node;
        // First pass: assign every split to a node that already stores it locally.
        for (ArrayList<String> info : this.nodes) {
            node = info.get(1);
            if (splitsMap.containsKey(node)) {
                for (Integer split : splitsMap.get(node)) {
                    schedule.put(split, node);
                    count--;
                }
                splitsMap.remove(node);
            } else {
                empty.add(node);
            }
        }
        // Check if every split got assigned to a node.
        if (count != 0) {
            ArrayList<Integer> remaining = new ArrayList<>();
            // Find the split indexes that have not been scheduled yet.
            // (The index is now incremented per split; previously it was reset
            // to 0 inside the loop, so only index 0 was ever examined.)
            for (int i = 0; i < this.splits.size(); i++) {
                if (!schedule.containsKey(i)) {
                    remaining.add(i);
                }
            }
            // Distribute the leftovers round-robin over the nodes with no local splits.
            if (!empty.isEmpty()) {
                int nodeNumber = 0;
                for (int split : remaining) {
                    if (nodeNumber == empty.size()) {
                        nodeNumber = 0;
                    }
                    schedule.put(split, empty.get(nodeNumber));
                    nodeNumber++;
                }
            }
        }
    }

    /**
     * Collect the node id and network address of every node from the node
     * controller information and save them inside nodes.
     */
    public void readNodesFromXML() {
        nodes = new ArrayList<>();
        for (NodeControllerInfo ncInfo : nodeControllerInfos.values()) {
            // Will this include the master node? Is that bad?
            ArrayList<String> info = new ArrayList<>();
            info.add(ncInfo.getNodeId());
            info.add(ncInfo.getNetworkAddress().getAddress());
            nodes.add(info);
        }
    }
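    /*
     * Worked example of the scheduling step (node names and indexes are
     * illustrative). If splits 0 and 2 are stored on "nc1" and split 1 on "nc2",
     * getLocationsOfSplits() yields {"nc1" -> [0, 2], "nc2" -> [1]}, and
     * scheduleSplits() inverts that into schedule = {0 -> "nc1", 1 -> "nc2",
     * 2 -> "nc1"}. Any split whose host matches no known node is then handed
     * round-robin to the nodes that received no local splits.
     */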
    /**
     * Write the schedule to a temporary file, one "split,node" pair per line,
     * then upload the file to HDFS.
     *
     * @throws UnsupportedEncodingException
     *             if the encoding is not supported
     * @throws FileNotFoundException
     *             if the file cannot be created
     */
    public void addScheduleToDistributedCache() throws FileNotFoundException, UnsupportedEncodingException {
        PrintWriter writer = new PrintWriter(FILEPATH, "UTF-8");
        for (int split : this.schedule.keySet()) {
            // println rather than write: separate the entries so the file can be
            // parsed line by line instead of becoming one unbroken run.
            writer.println(split + "," + this.schedule.get(split));
        }
        writer.close();
        // Add the file to HDFS.
        this.put(FILEPATH, DFS_PATH);
    }

    /**
     * Create and initialize a record reader over the first split that can be
     * read successfully, or return null if no reader could be created.
     */
    public RecordReader getReader() {
        List<FileSplit> fileSplits = new ArrayList<>();
        for (int i = 0; i < splits.size(); i++) {
            fileSplits.add((FileSplit) splits.get(i));
        }
        FileSplitsFactory splitsFactory;
        try {
            splitsFactory = new FileSplitsFactory(fileSplits);
            List<FileSplit> inputSplits = splitsFactory.getSplits();
            ContextFactory ctxFactory = new ContextFactory();
            int size = inputSplits.size();
            for (int i = 0; i < size; i++) {
                // Read the split; the first reader that initializes is returned.
                TaskAttemptContext context;
                try {
                    context = ctxFactory.createContext(job.getConfiguration(), i);
                    RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                    reader.initialize(inputSplits.get(i), context);
                    return reader;
                } catch (IOException | InterruptedException e) {
                    if (LOGGER.isLoggable(Level.SEVERE)) {
                        LOGGER.severe(e.getMessage());
                    }
                }
            }
        } catch (HyracksDataException e) {
            if (LOGGER.isLoggable(Level.SEVERE)) {
                LOGGER.severe(e.getMessage());
            }
        }
        return null;
    }

    /**
     * @return the schedule
     */
    public HashMap<Integer, String> getSchedule() {
        return this.schedule;
    }

    /**
     * Return the splits belonging to this node in the existing schedule.
     *
     * @param node
     *            HDFS node
     * @return list of split indexes assigned to the node
     */
    public ArrayList<Integer> getScheduleForNode(String node) {
        ArrayList<Integer> nodeSchedule = new ArrayList<>();
        for (int split : this.schedule.keySet()) {
            if (node.equals(this.schedule.get(split))) {
                nodeSchedule.add(split);
            }
        }
        return nodeSchedule;
    }

    public List<InputSplit> getSplits() {
        return this.splits;
    }

    public Job getJob() {
        return this.job;
    }

    public InputFormat getinputFormat() {
        return this.inputFormat;
    }

    /**
     * Parse an XML string into a DOM {@link Document}, or return null on failure.
     */
    public Document convertStringToDocument(String xmlStr) {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder;
        try {
            builder = factory.newDocumentBuilder();
            Document doc = builder.parse(new InputSource(new StringReader(xmlStr)));
            return doc;
        } catch (Exception e) {
            if (LOGGER.isLoggable(Level.SEVERE)) {
                LOGGER.severe(e.getMessage());
            }
        }
        return null;
    }
}
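/*
 * End-to-end sketch (hypothetical driver code, not part of this class; the
 * input path and tag are illustrative):
 *
 *   HDFSFunctions hdfs = new HDFSFunctions(nodeControllerInfos, System.getenv("HADOOP_CONF_DIR"));
 *   if (hdfs.getFileSystem() != null) {
 *       hdfs.setJob("/data/collection", "entry");
 *       hdfs.scheduleSplits();
 *       hdfs.addScheduleToDistributedCache();
 *       RecordReader reader = hdfs.getReader(); // reader over the first split, or null
 *   }
 */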