/**
 * Copyright 2017 ZuInnoTe (Jörn Franke) <[email protected]>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package org.zuinnote.hadoop.office.example;

import org.zuinnote.hadoop.office.example.driver.CSV2ExcelDriver;
import org.zuinnote.hadoop.office.format.common.HadoopOfficeReadConfiguration;
import org.zuinnote.hadoop.office.format.common.dao.SpreadSheetCellDAO;
import org.zuinnote.hadoop.office.format.common.parser.FormatNotUnderstoodException;
import org.zuinnote.hadoop.office.format.common.parser.msexcel.MSExcelParser;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.v2.MiniMRYarnCluster;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler;

import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

/**
 * Integration test for the CSV to Excel MapReduce example. It starts a local HDFS cluster
 * (MiniDFSCluster) and a local YARN cluster (MiniMRYarnCluster), submits the CSV2ExcelDriver
 * application and verifies the Excel output it produces.
 */
public final class MapReduceExcelOutputIntegrationTest {
	private static final String tmpPrefix = "hcl-integrationtest";
	private static java.nio.file.Path tmpPath;

	private static String CLUSTERNAME = "hcl-minicluster";
	private static String DFS_INPUT_DIR_NAME = "/input";
	private static String DFS_OUTPUT_DIR_NAME = "/output";
	private static String DEFAULT_OUTPUT_FILENAME = "part-r-00000";
	private static String DEFAULT_OUTPUT_EXCEL_FILENAME = "part-r-00000.xlsx";
	private static Path DFS_INPUT_DIR = new Path(DFS_INPUT_DIR_NAME);
	private static Path DFS_OUTPUT_DIR = new Path(DFS_OUTPUT_DIR_NAME);
	private static int NOOFNODEMANAGERS = 1;
	private static int NOOFDATANODES = 4;
	private static boolean STARTTIMELINESERVER = true;

	private static MiniDFSCluster dfsCluster;
	private static MiniMRYarnCluster miniCluster;

	private ArrayList<Decompressor> openDecompressors = new ArrayList<>();
	@BeforeAll
	public static void oneTimeSetUp() throws IOException {
		// create a temporary directory that serves as the HDFS base directory
		tmpPath = Files.createTempDirectory(tmpPrefix);
		// create a shutdown hook that removes the temporary files (= HDFS MiniCluster data) after shutdown;
		// may need to be rethought to avoid creating many threads
		Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
			@Override
			public void run() {
				try {
					Files.walkFileTree(tmpPath, new SimpleFileVisitor<java.nio.file.Path>() {

						@Override
						public FileVisitResult visitFile(java.nio.file.Path file, BasicFileAttributes attrs) throws IOException {
							Files.delete(file);
							return FileVisitResult.CONTINUE;
						}

						@Override
						public FileVisitResult postVisitDirectory(java.nio.file.Path dir, IOException e) throws IOException {
							if (e == null) {
								Files.delete(dir);
								return FileVisitResult.CONTINUE;
							}
							throw e;
						}
					});
				} catch (IOException e) {
					throw new RuntimeException("Error: temporary files in the following path could not be deleted: " + tmpPath, e);
				}
			}
		}));
		// create configuration
		Configuration conf = new Configuration();
		// create HDFS cluster
		File baseDir = new File(tmpPath.toString()).getAbsoluteFile();
		conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath());
		MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf);
		dfsCluster = builder.numDataNodes(NOOFDATANODES).build();
		// create YARN cluster
		YarnConfiguration clusterConf = new YarnConfiguration(conf);
		conf.set("fs.defaultFS", dfsCluster.getFileSystem().getUri().toString());
		conf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, 64);
		conf.setClass(YarnConfiguration.RM_SCHEDULER, FifoScheduler.class, ResourceScheduler.class);
		miniCluster = new MiniMRYarnCluster(CLUSTERNAME, NOOFNODEMANAGERS, STARTTIMELINESERVER);
		miniCluster.init(conf);
		miniCluster.start();
	}

	@AfterAll
	public static void oneTimeTearDown() {
		// destroy YARN cluster
		miniCluster.stop();
		// destroy HDFS cluster
		dfsCluster.shutdown();
	}

	@BeforeEach
	public void setUp() throws IOException {
		// create input directory
		dfsCluster.getFileSystem().mkdirs(DFS_INPUT_DIR);
	}

	@AfterEach
	public void tearDown() throws IOException {
		// remove input and output directory
		dfsCluster.getFileSystem().delete(DFS_INPUT_DIR, true);
		dfsCluster.getFileSystem().delete(DFS_OUTPUT_DIR, true);
		// close any open decompressor
		for (Decompressor currentDecompressor : this.openDecompressors) {
			if (currentDecompressor != null) {
				CodecPool.returnDecompressor(currentDecompressor);
			}
		}
	}

	@Test
	public void checkTestDataCSVAvailable() {
		ClassLoader classLoader = getClass().getClassLoader();
		String fileName = "simplecsv.csv";
		String fileNameCSV = classLoader.getResource(fileName).getFile();
		assertNotNull(fileNameCSV, "Test Data File \"" + fileName + "\" is not null in resource path");
		File file = new File(fileNameCSV);
		assertTrue(file.exists(), "Test Data File \"" + fileName + "\" exists");
		assertFalse(file.isDirectory(), "Test Data File \"" + fileName + "\" is not a directory");
	}
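	/*
	 * Illustrative only and not used by the tests in this class: a minimal sketch of the
	 * org.apache.hadoop.util.Tool contract that ToolRunner.run (used in the test below) expects
	 * from a driver such as CSV2ExcelDriver. The class name and the job wiring are hypothetical
	 * placeholders, not the actual implementation of CSV2ExcelDriver.
	 */
	private static final class MinimalDriverSketch extends org.apache.hadoop.conf.Configured implements org.apache.hadoop.util.Tool {

		@Override
		public int run(String[] args) throws Exception {
			// a real driver would configure input/output formats, mapper, reducer and the
			// input/output paths (taken from args) on the job before submitting it
			org.apache.hadoop.mapreduce.Job job = org.apache.hadoop.mapreduce.Job.getInstance(getConf(), "minimal-driver-sketch");
			return job.waitForCompletion(true) ? 0 : 1;
		}
	}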
	@Test
	public void mapReduceCSV2Excel() throws IOException, Exception {
		ClassLoader classLoader = getClass().getClassLoader();
		// put test data on DFS
		String fileName = "simplecsv.csv";
		String fileNameFullLocal = classLoader.getResource(fileName).getFile();
		Path inputFile = new Path(fileNameFullLocal);
		dfsCluster.getFileSystem().copyFromLocalFile(false, false, inputFile, DFS_INPUT_DIR);
		// set the locale (BCP 47) used by the HadoopOffice library when reading cells
		miniCluster.getConfig().set("hadoopoffice.read.locale.bcp47", "us");
		// submit the application; let ToolRunner handle generic command-line options
		int res = ToolRunner.run(miniCluster.getConfig(), new CSV2ExcelDriver(), new String[]{DFS_INPUT_DIR_NAME, DFS_OUTPUT_DIR_NAME});
		// check if it executed successfully
		// note: the following assertion only works on Linux platforms; other platforms may show issues
		// because the Hadoop unit testing framework only supports Linux. You can remove this check if
		// you work on another platform. The application itself builds and runs on a real cluster
		// without any issues.
		assertEquals(0, res, "Successfully executed mapreduce application");
		// fetch results
		List<SpreadSheetCellDAO[]> resultLines = readDefaultExcelResults(2);
		// compare results
		assertEquals(2, resultLines.size(), "Number of result lines is 2");
		assertEquals("1", resultLines.get(0)[0].getFormattedValue(), "Cell A1 has value 1");
		assertEquals("2", resultLines.get(0)[1].getFormattedValue(), "Cell B1 has value 2");
		assertEquals("3", resultLines.get(0)[2].getFormattedValue(), "Cell C1 has value 3");
		assertEquals("4", resultLines.get(0)[3].getFormattedValue(), "Cell D1 has value 4");
		assertEquals("test1", resultLines.get(1)[0].getFormattedValue(), "Cell A2 has value test1");
		assertEquals("test2", resultLines.get(1)[1].getFormattedValue(), "Cell B2 has value test2");
		assertEquals("test3", resultLines.get(1)[2].getFormattedValue(), "Cell C2 has value test3");
		assertEquals("test4", resultLines.get(1)[3].getFormattedValue(), "Cell D2 has value test4");
	}

	/**
	 * Read Excel rows from the default output directory and default Excel output file
	 *
	 * @param numOfRows number of rows to read
	 *
	 * @throws IOException in case the output file cannot be read
	 * @throws FormatNotUnderstoodException in case the output file is not a valid Excel file
	 *
	 */
	private List<SpreadSheetCellDAO[]> readDefaultExcelResults(int numOfRows) throws IOException, FormatNotUnderstoodException {
		ArrayList<SpreadSheetCellDAO[]> result = new ArrayList<>();
		Path defaultOutputfile = new Path(DFS_OUTPUT_DIR_NAME + "/" + DEFAULT_OUTPUT_EXCEL_FILENAME);
		InputStream defaultInputStream = openFile(defaultOutputfile);
		// create a new MS Excel parser
		HadoopOfficeReadConfiguration hocr = new HadoopOfficeReadConfiguration();
		MSExcelParser excelParser = new MSExcelParser(hocr, null);
		excelParser.parse(defaultInputStream);
		for (int i = 0; i < numOfRows; i++) {
			SpreadSheetCellDAO[] currentRow = (SpreadSheetCellDAO[]) excelParser.getNext();
			if (currentRow != null) {
				result.add(currentRow);
			}
		}
		excelParser.close();
		return result;
	}

	/**
	 * Read results from the default output directory and default output file name
	 *
	 * @param numOfRows number of rows to read
	 *
	 */
	private List<String> readDefaultResults(int numOfRows) throws IOException {
		ArrayList<String> result = new ArrayList<>();
		Path defaultOutputfile = new Path(DFS_OUTPUT_DIR_NAME + "/" + DEFAULT_OUTPUT_FILENAME);
		InputStream defaultInputStream = openFile(defaultOutputfile);
		BufferedReader reader = new BufferedReader(new InputStreamReader(defaultInputStream));
		int i = 0;
		while (reader.ready()) {
			if (i == numOfRows) {
				break;
			}
			result.add(reader.readLine());
			i++;
		}
		reader.close();
		return result;
	}
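	/*
	 * Note on openFile below: CompressionCodecFactory selects the codec from the file name suffix
	 * (e.g. ".gz" or ".bz2"); for an uncompressed file such as part-r-00000.xlsx it returns null
	 * and the raw FSDataInputStream is used. Illustrative call for a hypothetical compressed
	 * output file (not produced by this test):
	 *
	 *   InputStream in = openFile(new Path(DFS_OUTPUT_DIR_NAME + "/part-r-00000.bz2"));
	 */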
	/**
	 * Opens a file using the Hadoop API. It supports uncompressed and compressed files.
	 *
	 * @param path path to the file, e.g. file://path/to/file for a local file or hdfs://path/to/file for an HDFS file. All filesystems configured for Hadoop can be used.
	 *
	 * @return InputStream from which the file content can be read
	 *
	 * @throws java.io.IOException in case there is an issue reading the file
	 *
	 */
	private InputStream openFile(Path path) throws IOException {
		CompressionCodec codec = new CompressionCodecFactory(miniCluster.getConfig()).getCodec(path);
		FSDataInputStream fileIn = dfsCluster.getFileSystem().open(path);
		// check if compressed
		if (codec == null) {
			// uncompressed
			return fileIn;
		} else {
			// compressed
			Decompressor decompressor = CodecPool.getDecompressor(codec);
			this.openDecompressors.add(decompressor); // to be returned to the pool later in tearDown
			if (codec instanceof SplittableCompressionCodec) {
				long end = dfsCluster.getFileSystem().getFileStatus(path).getLen();
				final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
				return cIn;
			} else {
				return codec.createInputStream(fileIn, decompressor);
			}
		}
	}
}
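/*
 * Note: this test runs CSV2ExcelDriver inside the in-process mini cluster. On a real cluster the
 * same driver would typically be submitted with something along the lines of
 * "hadoop jar <application-jar> org.zuinnote.hadoop.office.example.driver.CSV2ExcelDriver /input /output"
 * (illustrative command line; the actual jar name and arguments depend on how the example is built).
 */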