org.apache.poi.hssf.extractor.ExcelExtractor Java Examples

The following examples show how to use org.apache.poi.hssf.extractor.ExcelExtractor. You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. Related API usage is listed in the sidebar.
Example #1
Source File: OLE2ExtractorFactory.java    From lams with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Builds a text extractor for the given OLE2 directory, when one is available.
 * Excel workbooks are handled directly; everything else is delegated to the
 * Scratchpad factory class, so the Scratchpad jar is generally required.
 * Embedded OOXML resources are not considered here; use
 *  {@link org.apache.poi.extractor.ExtractorFactory} for those.
 *
 * @param poifsDir the OLE2 directory to inspect
 * @return an extractor suited to the document found in the directory
 * @throws OldExcelFormatException if the old workbook entry is present
 * @throws IllegalArgumentException if the Scratchpad lookup fails or yields nothing
 */
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
    // Probe the directory for any of the known workbook entry names
    for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
        if (!poifsDir.hasEntry(workbookName)) {
            continue;
        }
        return getPreferEventExtractor()
                ? new EventBasedExcelExtractor(poifsDir)
                : new ExcelExtractor(poifsDir);
    }

    final boolean oldFormat = poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME);
    if (oldFormat) {
        throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
                + "found. Please call OldExcelExtractor directly for basic text extraction");
    }

    // Not an Excel workbook: hand over to the Scratchpad factory via reflection
    final Class<?> scratchpad = getScratchpadClass();
    final POITextExtractor extractor;
    try {
        Method factory = scratchpad.getDeclaredMethod("createExtractor", DirectoryNode.class);
        extractor = (POITextExtractor) factory.invoke(null, poifsDir);
    } catch (IllegalArgumentException iae) {
        // Already the exception type we report; pass it through untouched
        throw iae;
    } catch (Exception e) {
        throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
    }

    if (extractor == null) {
        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
    }
    return extractor;
}
 
Example #2
Source File: IndexerTextExtractor.java    From eplmp with Eclipse Public License 1.0 5 votes vote down vote up
/**
 * Extracts the text content of an Excel document in either the OLE2 (pre-2007)
 * or the OOXML format.
 * <p>
 * For OOXML workbooks every sheet is visited in order: cells are separated by a
 * space, rows end with a newline, and each sheet is followed by an extra newline.
 *
 * @param inputStream the raw document stream; it is closed before this method returns
 * @return the extracted text
 * @throws IOException if the stream cannot be read
 * @throws OpenXML4JException declared for the OOXML parsing path
 * @throws XmlException declared for the OOXML parsing path
 */
private String microsoftExcelDocumentToString(InputStream inputStream) throws IOException, OpenXML4JException, XmlException {
    StringBuilder sb = new StringBuilder();

    // Wrapped in a BufferedInputStream before the header check
    // (presumably because the check needs mark/reset support - confirm against the POI docs)
    try (InputStream excelStream = new BufferedInputStream(inputStream)) {
        if (POIFSFileSystem.hasPOIFSHeader(excelStream)) { // Before 2007 format files
            POIFSFileSystem excelFS = new POIFSFileSystem(excelStream);
            ExcelExtractor excelExtractor = new ExcelExtractor(excelFS);
            try {
                sb.append(excelExtractor.getText());
            } finally {
                // Close the extractor even when getText() fails
                excelExtractor.close();
            }
        } else { // New format
            XSSFWorkbook workBook = new XSSFWorkbook(excelStream);
            int numberOfSheets = workBook.getNumberOfSheets();
            for (int i = 0; i < numberOfSheets; i++) {
                // Use the loop index: reading sheet 0 every time would repeat the
                // first sheet and skip all the others
                XSSFSheet sheet = workBook.getSheetAt(i);
                for (Row row : sheet) {
                    for (Cell cell : row) {
                        sb.append(cell.toString());
                        sb.append(" ");
                    }
                    sb.append("\n");
                }
                sb.append("\n");
            }
        }
    }

    return sb.toString();
}
 
Example #3
Source File: MyExcelUtils.java    From spring-boot with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the content of an Excel file as a string, using the tools provided by POI.
 * <p>
 * The extractor is configured with {@code setFormulasNotResults(true)} and
 * {@code setIncludeSheetNames(true)}.
 *
 * @param excelFile the Excel file to extract
 * @return the extracted text, or {@code null} if an {@link IOException} occurs
 */
public String excelExtractor(File excelFile) {

    // try-with-resources so the file handle is released on every path
    try (FileInputStream in = new FileInputStream(excelFile)) {
        HSSFWorkbook wb = new HSSFWorkbook(in);
        ExcelExtractor extractor = new ExcelExtractor(wb);
        extractor.setFormulasNotResults(true);
        extractor.setIncludeSheetNames(true);
        return extractor.getText();
    } catch (IOException e) {
        // Best-effort contract kept from the original: report and return null
        e.printStackTrace();
        return null;
    }
}
 
Example #4
Source File: MsExcelTextExtractor.java    From document-management-system with GNU General Public License v2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 * <p>
 * Reads the stream as a POIFS file system and returns the text produced by
 * {@code ExcelExtractor}. Runtime failures are logged and rethrown as
 * {@link IOException}; the stream is closed in all cases.
 */
public String extractText(InputStream stream, String type, String encoding) throws IOException {
	try {
		final POIFSFileSystem fileSystem = new POIFSFileSystem(stream);
		final ExcelExtractor extractor = new ExcelExtractor(fileSystem);
		final String text = extractor.getText();
		return text;
	} catch (RuntimeException failure) {
		logger.warn("Failed to extract Excel text content", failure);
		final IOException wrapped = new IOException(failure.getMessage(), failure);
		throw wrapped;
	} finally {
		stream.close();
	}
}
 
Example #5
Source File: MSExcelIndexerTest.java    From carbon-apimgt with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the fixtures: mocked extractors, an indexer wrapper built on them,
 * and a File2Index created from empty data and empty strings.
 */
@Before
public void setup() {
    // Mocked extractors handed to the wrapper under test
    excelExtractor = Mockito.mock(ExcelExtractor.class);
    xssfExtractor = Mockito.mock(XSSFExcelExtractor.class);
    msExcelIndexer = new MSExcelIndexerWrapper(xssfExtractor, excelExtractor);

    final byte[] emptyData = "".getBytes();
    final int placeholderNumber = -1234;
    file2Index = new AsyncIndexer.File2Index(emptyData, "", "", placeholderNumber, "");
}
 
Example #6
Source File: MetadataExtractor.java    From document-management-system with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Extract metadata from a Microsoft Office document (Word, Excel or PowerPoint).
 * <p>
 * The summary information is read with the extractor matching the MIME type.
 * For any other MIME type no summary is read and an empty metadata object is
 * returned. Date fields that are missing from the document are left unset
 * instead of causing a {@link NullPointerException}.
 *
 * @param is stream containing the OLE2 document
 * @param mimeType MIME type used to select the extractor
 * @return the populated (possibly empty) metadata
 * @throws IOException if the stream cannot be read as a POIFS file system
 */
public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException {
	POIFSFileSystem fs = new POIFSFileSystem(is);
	OfficeMetadata md = new OfficeMetadata();
	SummaryInformation si = null;

	if (MimeTypeConfig.MIME_MS_WORD.equals(mimeType)) {
		si = new WordExtractor(fs).getSummaryInformation();
	} else if (MimeTypeConfig.MIME_MS_EXCEL.equals(mimeType)) {
		si = new ExcelExtractor(fs).getSummaryInformation();
	} else if (MimeTypeConfig.MIME_MS_POWERPOINT.equals(mimeType)) {
		si = new PowerPointExtractor(fs).getSummaryInformation();
	}

	if (si != null) {
		md.setTitle(si.getTitle());
		md.setSubject(si.getSubject());
		md.setAuthor(si.getAuthor());
		md.setLastAuthor(si.getLastAuthor());
		md.setKeywords(si.getKeywords());
		md.setComments(si.getComments());
		md.setTemplate(si.getTemplate());
		md.setRevNumber(si.getRevNumber());
		md.setApplicationName(si.getApplicationName());
		md.setEditTime(si.getEditTime());
		md.setPageCount(si.getPageCount());
		md.setWordCount(si.getWordCount());
		md.setCharCount(si.getCharCount());
		md.setSecurity(si.getSecurity());

		// Each date may be absent (e.g. a document that was never printed);
		// Calendar.setTime(null) would throw, so missing dates are skipped.
		java.util.Date created = si.getCreateDateTime();
		if (created != null) {
			Calendar createDateTime = Calendar.getInstance();
			createDateTime.setTime(created);
			md.setCreateDateTime(createDateTime);
		}

		java.util.Date lastSaved = si.getLastSaveDateTime();
		if (lastSaved != null) {
			Calendar lastSaveDateTime = Calendar.getInstance();
			lastSaveDateTime.setTime(lastSaved);
			md.setLastSaveDateTime(lastSaveDateTime);
		}

		java.util.Date printed = si.getLastPrinted();
		if (printed != null) {
			Calendar lastPrinted = Calendar.getInstance();
			lastPrinted.setTime(printed);
			md.setLastPrinted(lastPrinted);
		}
	}

	log.info("officeExtractor: {}", md);
	return md;
}
 
Example #7
Source File: ExcelOOXMLDocument.java    From olat with Apache License 2.0 4 votes vote down vote up
/**
 * Appends the text of the given header or footer, followed by a single space,
 * to the buffer. Nothing is appended when the extracted text is empty.
 *
 * @param buffy buffer receiving the text
 * @param hf header or footer to read
 */
private void extractHeaderFooter(final StringBuilder buffy, final HeaderFooter hf) {
    final String text = ExcelExtractor._extractHeaderFooter(hf);
    if (text.isEmpty()) {
        return;
    }
    buffy.append(text);
    buffy.append(' ');
}
 
Example #8
Source File: MSExcelIndexer.java    From carbon-apimgt with Apache License 2.0 4 votes vote down vote up
/**
 * Creates an {@code ExcelExtractor} over the in-memory data of the file to index.
 *
 * @param fileData holder whose {@code data} field supplies the document bytes
 * @return a new extractor reading from those bytes
 * @throws IOException if the data cannot be read as a POIFS file system
 */
protected ExcelExtractor getExcelExtractor(File2Index fileData) throws IOException {
	final ByteArrayInputStream rawBytes = new ByteArrayInputStream(fileData.data);
	final POIFSFileSystem fileSystem = new POIFSFileSystem(rawBytes);
	return new ExcelExtractor(fileSystem);
}
 
Example #9
Source File: MSExcelIndexerWrapper.java    From carbon-apimgt with Apache License 2.0 4 votes vote down vote up
/**
 * Stores the two supplied extractors in this wrapper's fields.
 *
 * @param xssfExtractor extractor kept in {@code xssfExcelExtractor}
 * @param excelExtractor extractor kept in {@code excelExtractor}
 */
public MSExcelIndexerWrapper(XSSFExcelExtractor xssfExtractor, ExcelExtractor excelExtractor) {
    this.excelExtractor = excelExtractor;
    xssfExcelExtractor = xssfExtractor;
}
 
Example #10
Source File: MSExcelIndexerWrapper.java    From carbon-apimgt with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the extractor held by this wrapper; the {@code fileData} argument is not used.
 */
@Override
protected ExcelExtractor getExcelExtractor(AsyncIndexer.File2Index fileData) throws IOException {
    final ExcelExtractor held = excelExtractor;
    return held;
}