org.apache.poi.xwpf.extractor.XWPFWordExtractor Java Examples

The following examples show how to use org.apache.poi.xwpf.extractor.XWPFWordExtractor. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: FileBeanParser.java    From everywhere with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the plain text of a Word document, handling both the binary
 * (OLE2, .doc) and the OOXML (.docx) formats.
 *
 * <p>The supplied stream is always closed before this method returns. Content that is
 * neither OLE2 nor OOXML yields an empty string.
 *
 * @param filePath path of the document, used only as the log message on failure
 * @param is       stream positioned at the start of the document
 * @return the extracted text, or an empty string when the format is not recognised
 *         or an {@code OfficeXmlFileException} was logged
 * @throws Exception if the stream cannot be inspected, parsed or closed
 */
private static String readDoc (String filePath, InputStream is) throws Exception {
    String text = "";
    // prepareToCheckMagic wraps the stream if needed so it supports mark/reset; closing the
    // wrapper closes the caller's stream too.
    try (InputStream in = FileMagic.prepareToCheckMagic(is)) {
        // Sniff the header once; valueOf() resets the stream after peeking.
        FileMagic magic = FileMagic.valueOf(in);
        if (magic == FileMagic.OLE2) {
            // try-with-resources releases the extractor even if getText() throws.
            try (WordExtractor ex = new WordExtractor(in)) {
                text = ex.getText();
            }
        } else if (magic == FileMagic.OOXML) {
            // Closing the extractor also releases the package opened by XWPFDocument.
            try (XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in))) {
                text = extractor.getText();
            }
        }
    } catch (OfficeXmlFileException e) {
        logger.error(filePath, e);
    }
    return text;
}
 
Example #2
Source File: IndexerTextExtractor.java    From eplmp with Eclipse Public License 1.0 6 votes vote down vote up
/**
 * Extracts the plain text of a Microsoft Word document supplied as a stream.
 *
 * <p>If the stream starts with a POIFS (OLE2) header it is read as a binary .doc file;
 * otherwise it is read as an OOXML .docx file. The supplied stream is closed before
 * this method returns.
 *
 * @param inputStream stream positioned at the start of the Word document
 * @return the text content of the document
 * @throws IOException if the document cannot be read or parsed
 */
private String microsoftWordDocumentToString(InputStream inputStream) throws IOException {
    // Buffering gives the header check the mark/reset support it needs.
    try (InputStream wordStream = new BufferedInputStream(inputStream)) {
        if (POIFSFileSystem.hasPOIFSHeader(wordStream)) {
            // try-with-resources releases the extractor even if getText() throws.
            try (WordExtractor wordExtractor = new WordExtractor(wordStream)) {
                return wordExtractor.getText();
            }
        }
        // Closing the extractor also releases the package opened by XWPFDocument.
        try (XWPFWordExtractor wordXExtractor = new XWPFWordExtractor(new XWPFDocument(wordStream))) {
            return wordXExtractor.getText();
        }
    }
}
 
Example #3
Source File: M2DocTestUtils.java    From M2Doc with Eclipse Public License 1.0 6 votes vote down vote up
/**
 * Reads the text of the .docx stored at the given {@link URI}.
 * <p>
 * Extraction is best effort: any failure is ignored and whatever was collected up to that point
 * (possibly the empty string) is returned.
 * </p>
 * 
 * @param uriConverter
 *            the {@link URIConverter} used to open the {@link URI}
 * @param uri
 *            the {@link URI} of the .docx
 * @return the text of the .docx, prefixed with a header line
 */
public static String getTextContent(URIConverter uriConverter, URI uri) {
    final StringBuilder content = new StringBuilder();

    try (InputStream stream = uriConverter.createInputStream(uri);
            OPCPackage pkg = OPCPackage.open(stream);
            XWPFDocument docx = new XWPFDocument(pkg);
            XWPFWordExtractor extractor = new XWPFWordExtractor(docx);) {

        content.append("===== Document Text ====\n");
        content.append(extractor.getText());
        // CHECKSTYLE:OFF
    } catch (Throwable ignored) {
        // CHECKSTYLE:ON
        /*
         * POI could not give us the text content: carry on regardless, the XML and hashes are still compared elsewhere.
         */
    }
    return content.toString();
}
 
Example #4
Source File: OfficeUtils.java    From dk-fitting with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the plain text of a Word 2007+ (.docx) file.
 *
 * <p>Failures are printed to standard error and reported to the caller as a
 * {@code null} result; the declared exceptions are kept for source compatibility.
 *
 * @param FilePath path of the .docx file to read
 * @return the text of the document, or {@code null} if it could not be read
 * @throws IOException        declared for compatibility; failures are caught internally
 * @throws XmlException       declared for compatibility; failures are caught internally
 * @throws OpenXML4JException declared for compatibility; failures are caught internally
 */
public static String Parse07(String FilePath) throws IOException, XmlException, OpenXML4JException{
    String text2007=null;
    try{
        OPCPackage opcPackage = POIXMLDocument.openPackage(FilePath);
        POIXMLTextExtractor opened;
        try {
            opened = new XWPFWordExtractor(opcPackage);
        } catch (Exception e) {
            // The extractor never took ownership: release the file handle without saving.
            opcPackage.revert();
            throw e;
        }
        // Closing the extractor releases the underlying package, even if getText() throws.
        try (POIXMLTextExtractor extractor = opened) {
            text2007 = extractor.getText();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return text2007;
}
 
Example #5
Source File: WordUtil.java    From javatech with Creative Commons Attribution Share Alike 4.0 International 5 votes vote down vote up
/**
 * Extracts the text of a .docx file and prints it to standard output.
 *
 * @param filename path of the .docx file to read
 * @throws IOException if the file cannot be opened, read or closed
 */
public static void extractor(String filename) throws IOException {
    // try-with-resources closes the file stream and the extractor (which in turn releases
    // the document's package) even when reading or printing fails.
    try (FileInputStream in = new FileInputStream(filename);
            XWPFWordExtractor we = new XWPFWordExtractor(new XWPFDocument(in))) {
        System.out.println(we.getText());
    }
}
 
Example #6
Source File: OOXMLWordFormatModule.java    From ontopia with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the text of an OOXML Word document held in memory and passes it to the
 * handler as a single region named "document".
 *
 * @param cc      the content whose bytes are parsed as a .docx package
 * @param handler receives the extracted text
 * @throws OntopiaRuntimeException wrapping any failure while parsing or handling the text
 */
@Override
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
  try {
    String s;
    // Closing the extractor releases the package it was built on, even if getText() throws.
    try (XWPFWordExtractor extractor = new XWPFWordExtractor(
        OPCPackage.open(new ByteArrayInputStream(cc.getContent())))) {
      s = extractor.getText();
    }
    char[] c = s.toCharArray();
    handler.startRegion("document");
    handler.text(c, 0, c.length);
    handler.endRegion();
  } catch (Exception e) {
    throw new OntopiaRuntimeException(e);
  }
}
 
Example #7
Source File: MSWordIndexerTest.java    From carbon-apimgt with Apache License 2.0 4 votes vote down vote up
/**
 * Checks the media type field of the indexed document across three consecutive
 * {@code getIndexedDocument} calls, with all POI constructors replaced by mocks.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws Exception {
    POIFSFileSystem mockFileSystem = Mockito.mock(POIFSFileSystem.class);
    WordExtractor mockDocExtractor = Mockito.mock(WordExtractor.class);
    XWPFWordExtractor mockDocxExtractor = Mockito.mock(XWPFWordExtractor.class);
    XWPFDocument mockDocx = Mockito.mock(XWPFDocument.class);

    // Successive POIFSFileSystem constructions: throw, then succeed, then throw again.
    PowerMockito.whenNew(POIFSFileSystem.class).withParameterTypes(InputStream.class)
            .withArguments(Mockito.any(InputStream.class))
            .thenThrow(OfficeXmlFileException.class)
            .thenReturn(mockFileSystem)
            .thenThrow(APIManagementException.class);
    PowerMockito.whenNew(WordExtractor.class).withArguments(mockFileSystem).thenReturn(mockDocExtractor);
    PowerMockito.whenNew(XWPFDocument.class).withParameterTypes(InputStream.class)
            .withArguments(Mockito.any())
            .thenReturn(mockDocx);
    PowerMockito.whenNew(XWPFWordExtractor.class).withArguments(mockDocx).thenReturn(mockDocxExtractor);
    Mockito.when(mockDocExtractor.getText()).thenReturn("");
    Mockito.when(mockDocxExtractor.getText()).thenReturn("");
    MSWordIndexer wordIndexer = new MSWordIndexer();

    // first call: file2Index carries no media type, so the default one is expected
    IndexDocument indexed = wordIndexer.getIndexedDocument(file2Index);
    Object mediaType = indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"application/pdf".equals(mediaType)) {
        Assert.fail();
    }

    // second call: the media type set on file2Index is expected
    file2Index.mediaType = "text/html";
    indexed = wordIndexer.getIndexedDocument(file2Index);
    mediaType = indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"text/html".equals(mediaType)) {
        Assert.fail();
    }

    // third call: the media type set on file2Index is expected even if reading the file failed
    file2Index.mediaType = "text/html";
    indexed = wordIndexer.getIndexedDocument(file2Index);
    mediaType = indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"text/html".equals(mediaType)) {
        Assert.fail();
    }
}
 
Example #8
Source File: MSOfficeBox.java    From wandora with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Extracts the plain text of a .docx file.
 *
 * <p>Failures are printed to standard error and reported as a {@code null} result.
 *
 * @param file the .docx file to read
 * @return the text of the document, or {@code null} if it could not be read
 */
public static String getDocxText(File file) {
    String text = null;
    // try-with-resources closes the file stream and the extractor (which in turn releases
    // the document's package) even when extraction fails.
    try (FileInputStream in = new FileInputStream(file);
            XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in))) {
        text = extractor.getText();
    }
    catch(Exception e) {
        e.printStackTrace();
    }
    return text;
}