Java Code Examples for org.apache.poi.hwpf.extractor.WordExtractor

The following examples show how to use org.apache.poi.hwpf.extractor.WordExtractor. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: everywhere   Source File: FileBeanParser.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the plain text of a Word document, picking the extractor from the stream's magic
 * bytes: OLE2 content is read with {@link WordExtractor}, OOXML content with
 * {@link XWPFWordExtractor}.
 *
 * @param filePath path of the document; used only as the log message if an
 *                 {@link OfficeXmlFileException} is caught
 * @param is       stream positioned at the start of the document; closed before returning
 * @return the extracted text, or an empty string when the content is neither OLE2 nor OOXML
 *         or when an {@link OfficeXmlFileException} was caught and logged
 * @throws Exception if sniffing, parsing or closing fails for any other reason
 */
private static String readDoc (String filePath, InputStream is) throws Exception {
    String text = "";
    // Make sure the stream supports mark/reset so the header can be peeked without consuming it.
    is = FileMagic.prepareToCheckMagic(is);
    try {
        // Sniff once; both branches below compare against the same result.
        FileMagic magic = FileMagic.valueOf(is);
        if (magic == FileMagic.OLE2) {
            // try-with-resources closes the extractor even when getText() throws.
            try (WordExtractor ex = new WordExtractor(is)) {
                text = ex.getText();
            }
        } else if (magic == FileMagic.OOXML) {
            XWPFDocument doc = new XWPFDocument(is);
            try (XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
                text = extractor.getText();
            }
        }
    } catch (OfficeXmlFileException e) {
        logger.error(filePath, e);
    } finally {
        if (is != null) {
            is.close();
        }
    }
    return text;
}
 
Example 2
Source Project: eplmp   Source File: IndexerTextExtractor.java    License: Eclipse Public License 1.0 6 votes vote down vote up
/**
 * Extracts the plain text of a Microsoft Word document supplied as a stream.
 *
 * <p>The stream is buffered so its header can be inspected; content with a POIFS (OLE2)
 * header is read with {@link WordExtractor}, anything else is treated as OOXML and read with
 * {@link XWPFWordExtractor}.
 *
 * @param inputStream the document content; closed (via its buffering wrapper) before returning
 * @return the text reported by the extractor
 * @throws IOException if the document cannot be read or parsed
 */
private String microsoftWordDocumentToString(InputStream inputStream) throws IOException {
    String strRet;

    try (InputStream wordStream = new BufferedInputStream(inputStream)) {
        if (POIFSFileSystem.hasPOIFSHeader(wordStream)) {
            // try-with-resources releases the extractor even when getText() throws.
            try (WordExtractor wordExtractor = new WordExtractor(wordStream)) {
                strRet = wordExtractor.getText();
            }
        } else {
            try (XWPFWordExtractor wordXExtractor = new XWPFWordExtractor(new XWPFDocument(wordStream))) {
                strRet = wordXExtractor.getText();
            }
        }
    }

    return strRet;
}
 
Example 3
Source Project: dk-fitting   Source File: OfficeUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads a binary Word document from disk and returns its text.
 *
 * <p>The length of the extracted text is also printed to standard output.
 *
 * @param f the Word file to read
 * @return the text reported by {@link WordExtractor#getText()}
 * @throws Exception if the file cannot be opened or parsed
 */
public static String word2Text(File f) throws Exception {
    // Both the file handle and the extractor are released on every exit path.
    try (FileInputStream in = new FileInputStream(f);
         WordExtractor extractor = new WordExtractor(in)) {
        String str = extractor.getText();
        System.out.println("the result length is"+str.length());
        return str;
    }
}
 
Example 4
Source Project: dk-fitting   Source File: OfficeUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Best-effort text extraction from a binary Word document on disk.
 *
 * <p>Any failure while opening or parsing the file is printed via
 * {@code printStackTrace()} and results in a {@code null} return value.
 *
 * @param FilePath path of the Word file to read
 * @return the extracted text, or {@code null} if the file could not be opened or parsed
 * @throws IOException if closing the underlying file stream fails
 */
public static String Parse03(String FilePath) throws IOException{
    String text2003=null;
    InputStream is = null;
    try {
        is = new FileInputStream(new File(FilePath));
        // Close the extractor even when getText() throws.
        try (WordExtractor ex = new WordExtractor(is)) {
            text2003 = ex.getText();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }finally{
        // The stream is still null when the file could not be opened; closing it
        // unconditionally would replace the reported failure with a NullPointerException.
        if (is != null) {
            is.close();
        }
    }
    return text2003;
}
 
Example 5
/**
 * {@inheritDoc} Extracts the text of a binary Word document. Any failure during
 * extraction is logged at warn level and rethrown as an {@link IOException} that
 * carries the original exception as its cause. The supplied stream is always closed.
 *
 * @param stream   the Word document content; closed before this method returns
 * @param type     not used by this implementation
 * @param encoding not used by this implementation
 * @return the text reported by the extractor
 * @throws IOException if extraction fails or the stream cannot be closed
 */
public String extractText(InputStream stream, String type, String encoding) throws IOException {
	// try-with-resources releases the extractor on both the success and the failure path.
	try (WordExtractor extractor = new WordExtractor(stream)) {
		return extractor.getText();
	} catch (Exception e) {
		logger.warn("Failed to extract Word text content", e);
		throw new IOException(e.getMessage(), e);
	} finally {
		stream.close();
	}
}
 
Example 6
Source Project: ontopia   Source File: WordFormatModule.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Parses the classifiable content as a Word document and feeds its whole text to the
 * handler as a single region named "document". Any failure is rethrown wrapped in an
 * {@link OntopiaRuntimeException}.
 */
@Override
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
  try {
    ByteArrayInputStream rawBytes = new ByteArrayInputStream(cc.getContent());
    WordExtractor wordExtractor = new WordExtractor(new BufferedInputStream(rawBytes));
    char[] chars = wordExtractor.getText().toCharArray();

    // The complete document text is emitted as one region.
    handler.startRegion("document");
    handler.text(chars, 0, chars.length);
    handler.endRegion();
  } catch (Exception e) {
    throw new OntopiaRuntimeException(e);
  }
}
 
Example 7
/**
 * Extract metadata from an Office document (Word, Excel or PowerPoint).
 *
 * <p>The summary information is read with the extractor matching {@code mimeType}. For
 * any other MIME type, or when no summary information is available, the returned
 * metadata object is left unpopulated. Date fields that are absent from the summary
 * information are not set.
 *
 * @param is       stream with the OLE2 document content
 * @param mimeType MIME type used to pick the extractor
 * @return the metadata object, populated from the summary information when available
 * @throws IOException if the stream cannot be read as a POIFS file system
 */
public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException {
	POIFSFileSystem fs = new POIFSFileSystem(is);
	OfficeMetadata md = new OfficeMetadata();
	SummaryInformation si = null;

	if (MimeTypeConfig.MIME_MS_WORD.equals(mimeType)) {
		si = new WordExtractor(fs).getSummaryInformation();
	} else if (MimeTypeConfig.MIME_MS_EXCEL.equals(mimeType)) {
		si = new ExcelExtractor(fs).getSummaryInformation();
	} else if (MimeTypeConfig.MIME_MS_POWERPOINT.equals(mimeType)) {
		si = new PowerPointExtractor(fs).getSummaryInformation();
	}

	if (si != null) {
		md.setTitle(si.getTitle());
		md.setSubject(si.getSubject());
		md.setAuthor(si.getAuthor());
		md.setLastAuthor(si.getLastAuthor());
		md.setKeywords(si.getKeywords());
		md.setComments(si.getComments());
		md.setTemplate(si.getTemplate());
		md.setRevNumber(si.getRevNumber());
		md.setApplicationName(si.getApplicationName());
		md.setEditTime(si.getEditTime());
		md.setPageCount(si.getPageCount());
		md.setWordCount(si.getWordCount());
		md.setCharCount(si.getCharCount());
		md.setSecurity(si.getSecurity());

		// A date property may be missing (e.g. a document that was never printed);
		// Calendar.setTime(null) would throw, so absent dates are skipped.
		Calendar createDateTime = toCalendarOrNull(si.getCreateDateTime());
		if (createDateTime != null) {
			md.setCreateDateTime(createDateTime);
		}

		Calendar lastSaveDateTime = toCalendarOrNull(si.getLastSaveDateTime());
		if (lastSaveDateTime != null) {
			md.setLastSaveDateTime(lastSaveDateTime);
		}

		Calendar lastPrinted = toCalendarOrNull(si.getLastPrinted());
		if (lastPrinted != null) {
			md.setLastPrinted(lastPrinted);
		}
	}

	log.info("officeExtractor: {}", md);
	return md;
}

/**
 * Converts a date to a calendar in the default time zone and locale, or returns
 * {@code null} when the date itself is {@code null}.
 */
private static Calendar toCalendarOrNull(java.util.Date date) {
	if (date == null) {
		return null;
	}
	Calendar calendar = Calendar.getInstance();
	calendar.setTime(date);
	return calendar;
}
 
Example 8
Source Project: carbon-apimgt   Source File: MSWordIndexerTest.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Indexes {@code file2Index} three times against mocked POI constructors and checks the
 * media-type field of each resulting document. The stubbed {@code POIFSFileSystem}
 * constructor throws {@code OfficeXmlFileException} on the first call, returns a mock on
 * the second and throws {@code APIManagementException} on the third.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws Exception {
    // Mocks standing in for the POI objects the indexer constructs.
    POIFSFileSystem mockedFileSystem = Mockito.mock(POIFSFileSystem.class);
    WordExtractor mockedOleExtractor = Mockito.mock(WordExtractor.class);
    XWPFWordExtractor mockedXmlExtractor = Mockito.mock(XWPFWordExtractor.class);
    XWPFDocument mockedXmlDocument = Mockito.mock(XWPFDocument.class);

    // Constructor stubbing; the POIFSFileSystem answers are consumed one per call.
    PowerMockito.whenNew(POIFSFileSystem.class).withParameterTypes(InputStream.class)
            .withArguments(Mockito.any(InputStream.class))
            .thenThrow(OfficeXmlFileException.class)
            .thenReturn(mockedFileSystem)
            .thenThrow(APIManagementException.class);
    PowerMockito.whenNew(WordExtractor.class).withArguments(mockedFileSystem).thenReturn(mockedOleExtractor);
    PowerMockito.whenNew(XWPFDocument.class).withParameterTypes(InputStream.class)
            .withArguments(Mockito.any())
            .thenReturn(mockedXmlDocument);
    PowerMockito.whenNew(XWPFWordExtractor.class).withArguments(mockedXmlDocument).thenReturn(mockedXmlExtractor);
    Mockito.when(mockedOleExtractor.getText()).thenReturn("");
    Mockito.when(mockedXmlExtractor.getText()).thenReturn("");
    MSWordIndexer indexer = new MSWordIndexer();

    // should return the default media type when media type is not defined in file2Index
    IndexDocument wordDoc = indexer.getIndexedDocument(file2Index);
    Assert.assertTrue(
            "application/pdf".equals(wordDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0)));

    // should return the media type we have set in the file2Index
    file2Index.mediaType = "text/html";
    wordDoc = indexer.getIndexedDocument(file2Index);
    Assert.assertTrue(
            "text/html".equals(wordDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0)));

    // should return the media type we have set in the file2Index even if exception occurred while reading the file
    file2Index.mediaType = "text/html";
    wordDoc = indexer.getIndexedDocument(file2Index);
    Assert.assertTrue(
            "text/html".equals(wordDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0)));
}