package file; import com.monitorjbl.xlsx.StreamingReader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.OfficeXmlFileException; import org.apache.poi.ss.usermodel.*; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Scanner; public class FileBeanParser { final static Logger logger = LoggerFactory.getLogger(FileBeanParser.class); public static FileBean getFileBean(File file) throws Exception { FileBean fileBean = new FileBean(); String filepath = file.getAbsolutePath(); String content; InputStream is = getInputStream(filepath); content = getContent(filepath, is); fileBean.setFilepath(filepath); fileBean.setLastModified(file.lastModified()); fileBean.setContent(content); return fileBean; } private static InputStream getInputStream(String filepath) throws Exception { return new FileInputStream(filepath); } private static String getContent(String filepath, InputStream is) throws Exception { String content = ""; if (filepath.endsWith(".doc") || filepath.endsWith(".docx")) { content = readDoc(filepath, is); } else if (filepath.endsWith(".xls") || filepath.endsWith(".xlsx")) { content = readExcel(is); } else if (filepath.endsWith(".pdf")) { content = readPdf(is); } else if (filepath.endsWith(".txt")) { content = readTxt(is); } return content; } public static FileMagic getFileMagic(InputStream is) throws IOException { is = FileMagic.prepareToCheckMagic(is); return FileMagic.valueOf(is); } @SuppressWarnings("deprecation" ) private static String readExcel(InputStream inp) throws Exception { Workbook wb; StringBuilder sb = new StringBuilder(); boolean isXls = false; if (getFileMagic(inp) == FileMagic.OLE2) { wb = new HSSFWorkbook(inp); isXls = true; } else { wb = StreamingReader.builder() .rowCacheSize(1000) // number of rows to keep in memory (defaults to 10) .bufferSize(4096) // buffer size to use when reading InputStream to file (defaults to 1024) .open(inp); // InputStream or File for XLSX file (required) } sb = readSheet(wb, sb, isXls); wb.close(); if (inp != null) { inp.close(); } return sb.toString(); } @SuppressWarnings({ "static-access", "deprecation" }) private static StringBuilder readSheet(Workbook wb, StringBuilder sb, boolean isXls) throws Exception { for (Sheet sheet: wb) { for (Row r: sheet) { for (Cell cell: r) { if (cell.getCellType() == Cell.CELL_TYPE_STRING) { sb.append(cell.getStringCellValue()); sb.append(" "); } else if (cell.getCellType() == Cell.CELL_TYPE_NUMERIC) { if (isXls) { DataFormatter formatter = new DataFormatter(); sb.append(formatter.formatCellValue(cell)); } else { sb.append(cell.getStringCellValue()); } sb.append(" "); } } } } return sb; } private static String readDoc (String filePath, InputStream is) throws Exception { String text= ""; is = FileMagic.prepareToCheckMagic(is); try { if (FileMagic.valueOf(is) == FileMagic.OLE2) { WordExtractor ex = new WordExtractor(is); text = ex.getText(); ex.close(); } else if(FileMagic.valueOf(is) == FileMagic.OOXML) { XWPFDocument doc = new XWPFDocument(is); XWPFWordExtractor extractor = new XWPFWordExtractor(doc); text = extractor.getText(); extractor.close(); } } catch (OfficeXmlFileException e) { logger.error(filePath, e); } finally { if (is != null) { is.close(); } } return text; } private static String readTxt(InputStream is) throws Exception { StringBuilder sb = new StringBuilder(); Scanner s = new Scanner(is); while (s.hasNext()) { sb.append(s.next()); } if (is != null) { is.close(); } return sb.toString(); } private static String readPdf(InputStream is) throws Exception { String result; PDDocument doc = PDDocument.load(is); PDFTextStripper stripper = new PDFTextStripper(); result = stripper.getText(doc); if(doc!= null) { doc.close(); } if (is != null) { is.close(); } return result; } }