org.apache.tika.parser.Parser Java Examples

The following examples show how to use org.apache.tika.parser.Parser. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SolrCellBuilder.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Picks the parser registered for the attachment MIME type of the given record.
 * An exact lookup on the base media type is attempted first; if that misses,
 * every registered media type is tested with {@code isMediaTypeMatch}.
 *
 * @param record record whose {@code Fields.ATTACHMENT_MIME_TYPE} value is inspected
 * @return the first matching parser, or {@code null} when the record carries no
 *         MIME type or no registered media type matches
 */
private Parser detectParser(Record record) {
  if (!hasAtLeastOneMimeType(record)) {
    return null;
  }

  final String mimeTypeValue = (String) record.getFirstValue(Fields.ATTACHMENT_MIME_TYPE); //ExtractingParams.STREAM_TYPE);
  assert mimeTypeValue != null;

  final MediaType baseType = parseMediaType(mimeTypeValue).getBaseType();

  // Fast path: direct map hit on the base type.
  final Parser exactMatch = mediaTypeToParserMap.get(baseType);
  if (exactMatch != null) {
    return exactMatch;
  }

  // Slow path: wildcard matching against every registered media type.
  for (Map.Entry<MediaType, Parser> candidate : mediaTypeToParserMap.entrySet()) {
    final MediaType registeredType = candidate.getKey();
    if (isMediaTypeMatch(baseType, registeredType)) {
      return candidate.getValue();
    }
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("No supported MIME type parser found for " + Fields.ATTACHMENT_MIME_TYPE + "=" + mimeTypeValue);
  }
  return null;
}
 
Example #2
Source File: TikaAutoMetadataExtracterTest.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Checks that every media type advertised by a fixed set of Tika parsers
 * is reported as supported by the extracter under test.
 */
public void testSupports() throws Exception
{
    final Parser[] parsers = {
             new OfficeParser(), new OpenDocumentParser(),
             new Mp3Parser(), new OOXMLParser()
    };

    // Collect the string form of every supported media type first.
    final ArrayList<String> advertised = new ArrayList<String>();
    for (Parser candidate : parsers)
    {
       for (MediaType type : candidate.getSupportedTypes(new ParseContext()))
       {
          advertised.add(type.toString());
       }
    }

    // Then verify each one against the extracter.
    for (String name : advertised)
    {
        assertTrue("Mimetype should be supported: " + name, extracter.isSupported(name));
    }
}
 
Example #3
Source File: ContentExtractor.java    From jate with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Parses the stream with {@code txtParser} and returns the text collected by a
 * write-limited handler. The stream is always closed before returning.
 *
 * @param stream   content to parse; closed by this method
 * @param metadata metadata object handed to the parser
 * @return the text gathered by the handler (possibly truncated at {@code maxStringLength})
 * @throws IOException   if reading or closing the stream fails
 * @throws TikaException if parsing fails, including a SAX failure that is not
 *                       the handler's own write-limit signal
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	final WriteOutContentHandler limitedSink = new WriteOutContentHandler(maxStringLength);
	try {
		final ParseContext parseContext = new ParseContext();
		parseContext.set(Parser.class, txtParser);
		final BodyContentHandler bodyOnly = new BodyContentHandler(limitedSink);
		txtParser.parse(stream, bodyOnly, metadata, parseContext);
	} catch (SAXException e) {
		final boolean limitReached = limitedSink.isWriteLimitReached(e);
		if (!limitReached) {
			// This should never happen with BodyContentHandler...
			throw new TikaException("Unexpected SAX processing failure", e);
		}
		// Otherwise the write limit was hit: keep what was collected so far.
	} finally {
		stream.close();
	}
	return limitedSink.toString();
}
 
Example #4
Source File: ImageConverter.java    From openmeetings with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the image dimensions with Tika's {@code ImageParser} and stores them on
 * the given file item.
 *
 * @param f    file item that receives the width and height
 * @param img  image file to inspect
 * @param mime content type passed to the parser through the metadata
 * @return a result with exit code {@code ZERO} on success, or {@code -1} plus
 *         error details when anything fails
 */
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
	final ProcessResult result = new ProcessResult();
	result.setProcess("get image dimensions :: " + f.getId());
	final Parser imageParser = new ImageParser();
	try (InputStream in = new FileInputStream(img)) {
		final Metadata meta = new Metadata();
		meta.set(CONTENT_TYPE, mime);
		imageParser.parse(in, new DefaultHandler(), meta, new ParseContext());

		// Width is applied before the height is even read, as before.
		final Integer width = Integer.valueOf(meta.get(TIFF.IMAGE_WIDTH));
		f.setWidth(width);
		final Integer height = Integer.valueOf(meta.get(TIFF.IMAGE_LENGTH));
		f.setHeight(height);

		result.setExitCode(ZERO);
	} catch (Exception e) {
		log.error("Error while getting dimensions", e);
		result.setError("Error while getting dimensions");
		result.setException(e.getMessage());
		result.setExitCode(-1);
	}
	return result;
}
 
Example #5
Source File: FTConnector.java    From openprodoc with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Extracts the text and the metadata of a document with Tika's auto-detecting parser.
 * <p>
 * On success the extracted text is stored in {@code FullText} and the metadata,
 * rendered as one {@code key=value} line per metadata name, in {@code FileMetadata}.
 *
 * @param Bytes stream with the document content; it is not closed by this method
 * @return the value of {@code FullText} after the extraction attempt
 * @throws PDException presumably raised by {@code PDException.GenPDException} when
 *                     extraction fails (that helper is defined outside this view)
 */
protected String Convert(InputStream Bytes) throws PDException
{
try {
    // -1 asks BodyContentHandler for no write limit (see Tika's BodyContentHandler docs).
    ContentHandler textHandler=new BodyContentHandler(-1);
    Metadata metadata=new Metadata();
    Parser parser=new AutoDetectParser();
    ParseContext context=new ParseContext();
    parser.parse(Bytes, textHandler, metadata, context);
    // Build the metadata dump in a StringBuilder instead of repeated String
    // concatenation on the field, which was quadratic in the metadata size.
    StringBuilder MetadataText=new StringBuilder();
    for (String key : metadata.names())
        {
        MetadataText.append(key).append("=").append(metadata.get(key)).append("\n");
        }
    FileMetadata=MetadataText.toString();
    FullText=textHandler.toString();
    } catch (Exception ex)
    {
    PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }

return(FullText);
}
 
Example #6
Source File: TikaProcessor.java    From quarkus with Apache License 2.0 6 votes vote down vote up
/**
 * Computes the parser providers to register, keyed by parser class name.
 * <p>
 * Providers listed in {@code NOT_NATIVE_READY_PARSERS} are always dropped. When a
 * Tika config path is present, or no parser list was requested, every remaining
 * provider is returned with an empty parameter list. Otherwise only the requested
 * parsers are returned, each with the parameters configured for its abbreviation.
 *
 * @param tikaConfigPath      optional path to a Tika configuration
 * @param requiredParsers     optional comma-separated list of parser names/abbreviations
 * @param parserParamMaps     parameters per parser abbreviation
 * @param parserAbbreviations mapping used by {@code getParserNameFromConfig}
 * @return parser class name mapped to its parameter list
 * @throws Exception if the provider names cannot be resolved
 */
public static Map<String, List<TikaParserParameter>> getSupportedParserConfig(Optional<String> tikaConfigPath,
        Optional<String> requiredParsers,
        Map<String, Map<String, String>> parserParamMaps,
        Map<String, String> parserAbbreviations) throws Exception {
    final Predicate<String> nativeReady = name -> !NOT_NATIVE_READY_PARSERS.contains(name);
    final List<String> providerNames = getProviderNames(Parser.class.getName());

    final boolean useAllProviders = tikaConfigPath.isPresent() || !requiredParsers.isPresent();
    if (useAllProviders) {
        return providerNames.stream()
                .filter(nativeReady)
                .collect(Collectors.toMap(Function.identity(),
                        name -> Collections.<TikaParserParameter> emptyList()));
    }

    // Explicit parser list: resolve each requested entry to its full class name.
    final String[] requested = requiredParsers.get().split(",");
    final List<String> trimmedNames = Arrays.stream(requested)
            .map(String::trim)
            .collect(Collectors.toList());
    final Map<String, String> abbreviationByFullName = trimmedNames.stream()
            .collect(Collectors.toMap(abbr -> getParserNameFromConfig(abbr, parserAbbreviations),
                    Function.identity()));

    return providerNames.stream()
            .filter(nativeReady)
            .filter(abbreviationByFullName::containsKey)
            .collect(Collectors.toMap(Function.identity(),
                    name -> getParserConfig(name, parserParamMaps.get(abbreviationByFullName.get(name)))));
}
 
Example #7
Source File: CachingTesseractOCRParserTest.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses the same image twice with a caching OCR parser and checks that the
 * first run is counted as a cache miss and the second as a cache hit, with
 * identical extracted text both times.
 */
@Test
public void testWriteToCache() throws Throwable {
	final Path imagePath = Paths.get(this.simple.toURI());

	final AtomicInteger hitCount = new AtomicInteger();
	final AtomicInteger missCount = new AtomicInteger();

	// Subclass only to observe the cache callbacks.
	final Parser cachingParser = new CachingTesseractOCRParser(tmpDir) {

		private static final long serialVersionUID = 6551690243986921730L;

		@Override
		public void cacheHit() {
			hitCount.incrementAndGet();
		}

		@Override
		public void cacheMiss() {
			missCount.incrementAndGet();
		}
	};

	// First pass: nothing cached yet.
	final Writer firstPass = new StringWriter();
	try (final InputStream in = Files.newInputStream(imagePath)) {
		cachingParser.parse(in, new WriteOutContentHandler(firstPass), new Metadata(), new ParseContext());
	}

	Assert.assertEquals("HEAVY\nMETAL", firstPass.toString().trim());
	Assert.assertEquals(0, hitCount.get());
	Assert.assertEquals(1, missCount.get());

	// Second pass: expected to be served from the cache.
	final Writer secondPass = new StringWriter();
	try (final InputStream in = Files.newInputStream(imagePath)) {
		cachingParser.parse(in, new WriteOutContentHandler(secondPass), new Metadata(), new ParseContext());
	}

	Assert.assertEquals("HEAVY\nMETAL", secondPass.toString().trim());
	Assert.assertEquals(1, hitCount.get());
	Assert.assertEquals(1, missCount.get());
}
 
Example #8
Source File: TikaAnalysis.java    From tutorials with MIT License 5 votes vote down vote up
/**
 * Parses the stream with an {@code AutoDetectParser} and returns the metadata
 * object that was passed to the parser.
 *
 * @param stream document content; not closed by this method
 * @return the metadata populated during parsing
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the content handler fails
 * @throws TikaException if the document cannot be parsed
 */
public static Metadata extractMetadatatUsingParser(InputStream stream) throws IOException, SAXException, TikaException {
    final Parser autoDetect = new AutoDetectParser();
    final ContentHandler discardedBody = new BodyContentHandler();
    final Metadata collected = new Metadata();

    autoDetect.parse(stream, discardedBody, collected, new ParseContext());
    return collected;
}
 
Example #9
Source File: TikaTest.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts text through an {@code OriginalBodyContentHandler}.
 * The input stream is closed once parsing has finished or failed.
 *
 * @param is       document content; closed by this method
 * @param parser   parser to run
 * @param context  parse context handed to the parser
 * @param metadata metadata handed to the parser
 * @return the string form of the content handler after parsing
 */
public String getTextWoDoublebreaks(InputStream is, Parser parser,
                                    ParseContext context, Metadata metadata) throws Exception{
    final ContentHandler bodyCollector = new OriginalBodyContentHandler();
    try {
        parser.parse(is, bodyCollector, metadata, context);
        return bodyCollector.toString();
    } finally {
        is.close();
    }
}
 
Example #10
Source File: TikaTest.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Basic text extraction through a {@code BodyContentHandler} created with a
 * write limit of 1000000.
 * <p>
 * The input stream is closed after processing, whether or not parsing succeeded.
 *
 * @param is       document content; closed by this method
 * @param parser   parser to run
 * @param context  parse context handed to the parser
 * @param metadata metadata handed to the parser
 * @return the string form of the content handler after parsing
 */
public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{
    final ContentHandler boundedBody = new BodyContentHandler(1000000);
    try {
        parser.parse(is, boundedBody, metadata, context);
        return boundedBody.toString();
    } finally {
        is.close();
    }
}
 
Example #11
Source File: TikaTest.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Parses a test document through a {@code RecursiveParserWrapper} around the
 * given parser and returns the metadata list gathered by the handler.
 *
 * @param filePath     resource name below {@code /test-documents/}
 * @param parserToWrap parser wrapped by the recursive wrapper
 * @param handlerType  handler type passed to the content handler factory
 * @return the metadata list reported by the recursive handler
 */
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception {
    final RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(parserToWrap);
    final BasicContentHandlerFactory handlerFactory = new BasicContentHandlerFactory(handlerType, -1);
    final RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(handlerFactory);

    final String resourceName = "/test-documents/" + filePath;
    try (InputStream in = getResourceAsStream(resourceName)) {
        recursiveParser.parse(in, collector, new Metadata(), new ParseContext());
    }
    return collector.getMetadataList();
}
 
Example #12
Source File: TikaAnalysis.java    From tutorials with MIT License 5 votes vote down vote up
/**
 * Parses the stream with an {@code AutoDetectParser} and returns the text
 * collected by a default {@code BodyContentHandler}.
 *
 * @param stream document content; not closed by this method
 * @return the string form of the body content handler after parsing
 * @throws IOException   if the stream cannot be read
 * @throws TikaException if the document cannot be parsed
 * @throws SAXException  if the content handler fails
 */
public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
    final Parser autoDetect = new AutoDetectParser();
    final ContentHandler bodyText = new BodyContentHandler();
    final Metadata ignoredMetadata = new Metadata();

    autoDetect.parse(stream, bodyText, ignoredMetadata, new ParseContext());
    return bodyText.toString();
}
 
Example #13
Source File: TikaProcessor.java    From quarkus with Apache License 2.0 5 votes vote down vote up
/**
 * Build step that registers the Tika service providers and hands the generated
 * Tika XML configuration to the recorder.
 *
 * @param beanContainer   build item whose value is passed to the recorder
 * @param recorder        recorder receiving the parser initialisation call
 * @param serviceProvider producer for the parser, detector and encoding detector providers
 * @param configuration   Tika extension configuration
 * @throws Exception if the supported parsers or provider names cannot be determined
 */
@BuildStep
@Record(ExecutionTime.STATIC_INIT)
void initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder,
        BuildProducer<ServiceProviderBuildItem> serviceProvider, TikaConfiguration configuration)
        throws Exception {
    final Map<String, List<TikaParserParameter>> supportedParsers = getSupportedParserConfig(
            configuration.tikaConfigPath,
            configuration.parsers,
            configuration.parserOptions,
            configuration.parser);
    final String xmlConfig = generateTikaXmlConfiguration(supportedParsers);

    // Parsers: only the supported subset computed above.
    final String parserService = Parser.class.getName();
    serviceProvider.produce(new ServiceProviderBuildItem(parserService, new ArrayList<>(supportedParsers.keySet())));

    // Detectors and encoding detectors: every provider that is found.
    final String detectorService = Detector.class.getName();
    serviceProvider.produce(new ServiceProviderBuildItem(detectorService, getProviderNames(detectorService)));
    final String encodingDetectorService = EncodingDetector.class.getName();
    serviceProvider.produce(
            new ServiceProviderBuildItem(encodingDetectorService, getProviderNames(encodingDetectorService)));

    recorder.initTikaParser(beanContainer.getValue(), configuration, xmlConfig);
}
 
Example #14
Source File: TikaTest.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Parses a test document recursively with an {@code AutoDetectParser}, using an
 * XML handler factory, and returns the metadata list gathered by the handler.
 *
 * @param filePath resource name below {@code /test-documents/}
 * @param context  parse context handed to the wrapper
 * @param metadata metadata object handed to the wrapper
 * @return the metadata list reported by the recursive handler
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
    final Parser autoDetect = new AutoDetectParser();
    final RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(autoDetect);
    final BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    final RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    final String resourceName = "/test-documents/" + filePath;
    try (InputStream in = getResourceAsStream(resourceName)) {
        recursiveParser.parse(in, collector, metadata, context);
    }
    return collector.getMetadataList();
}
 
Example #15
Source File: TikaTest.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Parses a test document recursively with an {@code AutoDetectParser} and a
 * fresh {@code Metadata} object, using an XML handler factory.
 *
 * @param filePath resource name below {@code /test-documents/}
 * @param context  parse context handed to the wrapper
 * @return the metadata list reported by the recursive handler
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    final Parser autoDetect = new AutoDetectParser();
    final RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(autoDetect);
    final BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    final RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    final String resourceName = "/test-documents/" + filePath;
    try (InputStream in = getResourceAsStream(resourceName)) {
        recursiveParser.parse(in, collector, new Metadata(), context);
    }
    return collector.getMetadataList();
}
 
Example #16
Source File: TikaTest.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Parses a test document through a {@code RecursiveParserWrapper} around the
 * given parser, using an XML handler factory and a fresh {@code Metadata} object.
 *
 * @param filePath     resource name below {@code /test-documents/}
 * @param parserToWrap parser wrapped by the recursive wrapper
 * @param parseContext parse context handed to the wrapper
 * @return the metadata list reported by the recursive handler
 */
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, ParseContext parseContext) throws Exception {
    final RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(parserToWrap);
    final BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    final RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    final String resourceName = "/test-documents/" + filePath;
    try (InputStream in = getResourceAsStream(resourceName)) {
        recursiveParser.parse(in, collector, new Metadata(), parseContext);
    }
    return collector.getMetadataList();
}
 
Example #17
Source File: ParsingReader.java    From extract with MIT License 5 votes vote down vote up
/**
 * Creates a reader for the content of the given binary stream
 * with the given document metadata. The given parser is stored on this
 * reader and the parsing task is started through {@code parse()}.
 *
 * The created reader will be responsible for closing the given stream.
 * The stream and any associated resources will be closed at or before
 * the time when the {@link #close()} method is called on this reader.
 *
 * @param parser parser instance
 * @param input binary stream
 * @param metadata document metadata
 * @param context parsing context
 * @param handler factory that is given the writer end of the internal pipe and
 *                returns the content handler stored on this reader
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(final Parser parser, final InputStream input, final Metadata metadata, final ParseContext
		context, final Function<Writer, ContentHandler> handler) throws IOException {
	final PipedReader pipedReader = new PipedReader();

	this.parser = parser;
	// Consumers read from a buffered view of the pipe's reader end.
	reader = new BufferedReader(pipedReader);

	try {
		// Connect the writer end of the pipe to the reader created above.
		writer = new PipedWriter(pipedReader);
	} catch (IOException e) {
		throw new IllegalStateException(e); // Should never happen.
	}

	this.input = input;
	this.metadata = metadata;
	this.context = context;

	// Generate the handler.
	this.handler = handler.apply(writer);

	// NOTE(review): parse() is defined outside this view; presumably it starts
	// the parsing task that feeds the pipe — confirm.
	parse();
	
	// TIKA-203: Buffer first character to force metadata extraction.
	reader.mark(1);

	//noinspection ResultOfMethodCallIgnored
	reader.read();
	reader.reset();
}
 
Example #18
Source File: EmbeddedDocumentMemoryExtractor.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses the root document with a digest-matching embedded document extractor
 * and returns whatever document that extractor captured.
 *
 * @param rootDocument           document whose file is parsed and whose metadata is passed to the parser
 * @param embeddedDocumentDigest digest handed to the embedded document extractor
 * @return the document reported by the extractor after parsing
 * @throws SAXException  if the content handler fails
 * @throws TikaException if the document cannot be parsed
 * @throws IOException   if the root document file cannot be opened, read or closed
 */
public TikaDocumentSource extract(final TikaDocument rootDocument, final String embeddedDocumentDigest) throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    ContentHandler handler = new BodyContentHandler(-1);
    context.set(Parser.class, parser);

    DigestEmbeddedDocumentExtractor extractor = new DigestEmbeddedDocumentExtractor(rootDocument, embeddedDocumentDigest, context, digester, algorithm);
    context.set(org.apache.tika.extractor.EmbeddedDocumentExtractor.class, extractor);

    // The stream was previously created inline and never closed, leaking a file
    // handle on every call; try-with-resources closes it on success and failure.
    try (FileInputStream rootStream = new FileInputStream(rootDocument.getPath().toFile())) {
        parser.parse(rootStream, handler, rootDocument.getMetadata(), context);
    }

    return extractor.getDocument();
}
 
Example #19
Source File: EmbeddingHTMLParsingReader.java    From extract with MIT License 5 votes vote down vote up
/**
 * Creates a parsing reader whose output is serialised as HTML5 through a
 * {@code SubstitutingContentHandler}, and wraps the resulting reader in a
 * {@code TokenReplacingReader} that resolves tokens to embedded documents of
 * the parent.
 *
 * @param parent   document whose embeds are looked up by token
 * @param open     opening delimiter passed to the handler and the replacing reader
 * @param close    closing delimiter passed to the handler and the replacing reader
 * @param parser   parser instance
 * @param input    document stream
 * @param metadata document metadata
 * @param context  parsing context
 * @throws IOException if the superclass constructor fails
 */
public EmbeddingHTMLParsingReader(final TikaDocument parent, final String open, final String close,
		final Parser parser, final TikaInputStream input, final Metadata metadata,
		final ParseContext context) throws IOException {
	super(parser, input, metadata, context, (out)-> new SubstitutingContentHandler(parent, open, close,
			new ExpandedTitleContentHandler(new HTML5Serializer(out))));
	this.replacer = new TokenReplacingReader((token)-> {
		final EmbeddedTikaDocument embedded = parent.getEmbed(token);

		// An unknown token yields null.
		if (null != embedded) {
			return DataURIEncodingInputStream.createReader(embedded.getPath(), embedded.getMetadata());
		}

		return null;
	}, reader, open, close);
}
 
Example #20
Source File: TikaLambdaHandler.java    From tika-lambda with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the text of the given object with Tika's {@code AutoDetectParser},
 * serialising the SAX events as text through a transformer handler.
 *
 * @param bucket     bucket name, used only when assembling the result
 * @param key        object key, used for the result and for the synthetic failure check
 * @param objectData object content; not closed by this method
 * @return the assembled extraction result, or the assembled exception result
 *         when Tika reports a {@code TikaException}
 * @throws IOException                        if the object content cannot be read
 * @throws TransformerConfigurationException if the transformer handler cannot be created
 * @throws SAXException                       if SAX event handling fails
 */
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
  _logger.log("Extracting text with Tika");
  String extractedText = "";

  SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
  TransformerHandler handler = factory.newTransformerHandler();
  handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
  handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
  StringWriter sw = new StringWriter();
  handler.setResult(new StreamResult(sw));
  AutoDetectParser parser = new AutoDetectParser();
  ParseContext parseContext = new ParseContext();
  parseContext.set(Parser.class, parser);

  // The unused "new Tika()" facade that used to be built here was removed.
  Metadata tikaMetadata = new Metadata();
  try {
    // for synthetic transactions
    // Locale.ROOT keeps the comparison independent of the JVM default locale
    // (e.g. Turkish dotless-i lower-casing).
    if( key.toLowerCase(java.util.Locale.ROOT).endsWith("tika.exception.testing.pdf")) {
      throw new TikaException("Test Tika Exception");
    }
    parser.parse(objectData, handler, tikaMetadata, parseContext);
    extractedText = sw.toString();
  } catch( TikaException e) {
    _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
    return assembleExceptionResult(bucket, key, e);
  }
  _logger.log("Tika parsing success");
  return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}
 
Example #21
Source File: TikaIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Parses one readable file with Tika and outputs a {@code ParseResult}: a
 * success carrying the extracted text and metadata, or a failure carrying the
 * text collected so far, the metadata and the exception.
 *
 * @param c process context supplying the file and receiving the result
 * @throws Exception if the file cannot be opened or the stream cannot be closed
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  ReadableFile readable = c.element();
  InputStream rawStream = Channels.newInputStream(readable.open());
  try (InputStream tikaStream = TikaInputStream.get(rawStream)) {
    // Use the configured TikaConfig when one was supplied.
    Parser parser;
    if (tikaConfig == null) {
      parser = new AutoDetectParser();
    } else {
      parser = new AutoDetectParser(tikaConfig);
    }

    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, parser);

    // Start from the caller-provided metadata if any, else from an empty one.
    Metadata tikaMetadata;
    if (spec.getInputMetadata() != null) {
      tikaMetadata = spec.getInputMetadata();
    } else {
      tikaMetadata = new Metadata();
    }
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    String location = readable.getMetadata().resourceId().toString();
    ContentHandler textHandler = new ToTextContentHandler();
    ParseResult outcome;
    try {
      parser.parse(tikaStream, textHandler, tikaMetadata, parseContext);
      outcome = ParseResult.success(location, textHandler.toString(), tikaMetadata);
    } catch (Exception e) {
      // A parse failure is reported as a result rather than rethrown.
      outcome = ParseResult.failure(location, textHandler.toString(), tikaMetadata, e);
    }

    c.output(outcome);
  }
}
 
Example #22
Source File: TikaCallable.java    From flink-crawler with Apache License 2.0 5 votes vote down vote up
/**
 * Captures every argument in the corresponding field; no other work is done here.
 *
 * @param parser           parser to store
 * @param contentExtractor content extractor to store
 * @param linkExtractor    link extractor to store
 * @param input            input stream to store
 * @param metadata         metadata to store
 * @param extractLanguage  language extraction flag to store
 * @param parseContext     parse context to store
 */
public TikaCallable(Parser parser, BaseContentExtractor contentExtractor,
        BaseLinkExtractor linkExtractor, InputStream input, Metadata metadata,
        boolean extractLanguage, ParseContext parseContext) {
    // Parsing inputs
    _parser = parser;
    _parseContext = parseContext;
    _input = input;
    _metadata = metadata;

    // Extraction collaborators and options
    _contentExtractor = contentExtractor;
    _linkExtractor = linkExtractor;
    _extractLanguage = extractLanguage;
}
 
Example #23
Source File: ArchiveContentTransformer.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Extends the parent's parse context with the parser used for embedded content:
 * an {@code AutoDetectParser} when recursion is wanted, an {@code EmptyParser}
 * otherwise. The {@code includeEmbedded} option, when set, overrides the
 * {@code includeContents} default.
 */
@Override
protected ParseContext buildParseContext(Metadata metadata,
     String targetMimeType, TransformationOptions options) {
  final ParseContext parseContext = super.buildParseContext(metadata, targetMimeType, options);

  final boolean descend = (options.getIncludeEmbedded() != null)
        ? options.getIncludeEmbedded()
        : includeContents;

  if(!descend)
  {
      // REPO-1066: an AutoDetectParser is the default in Tika after: https://issues.apache.org/jira/browse/TIKA-2096
      // so we need to specify an empty one if we don't want the recurse parsing to happen
      parseContext.set(Parser.class, new EmptyParser());
      return parseContext;
  }

  // Use an auto detect parser to handle the contents
  if(tikaConfig == null)
  {
      tikaConfig = TikaConfig.getDefaultConfig();
  }
  parseContext.set(Parser.class, new AutoDetectParser(tikaConfig));
  return parseContext;
}
 
Example #24
Source File: TikaAudioMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Builds a new {@code CompositeParser} from the media type registry of
 * {@code tikaConfig} and the {@code parsers} field.
 *
 * @return a freshly created composite parser
 */
@Override
protected Parser getParser()
{
   final Parser composite = new CompositeParser(tikaConfig.getMediaTypeRegistry(), parsers);
   return composite;
}
 
Example #25
Source File: TikaSpringConfiguredMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Injects the name of the Tika parser to use, loads that class and installs
 * the parser returned by {@code getParser()}.
 *
 * @param className fully qualified name of the Tika parser class
 * @throws AlfrescoRuntimeException if the class cannot be found; the original
 *         {@code ClassNotFoundException} is attached as the cause
 */
@SuppressWarnings("unchecked")
public void setTikaParserName(String className)
{
   tikaParserClassName = className;
   
   // Load the class
   try {
      tikaParserClass = (Class<? extends Parser>)Class.forName(tikaParserClassName);
      setTikaParser(getParser());
   } catch(ClassNotFoundException e) {
      // Keep the cause so the class-loading failure is not lost from the stack trace.
      throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
   }
}
 
Example #26
Source File: TikaSpringConfiguredMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Injects the Tika parser to use and publishes the media types it supports
 * to the parent class as the supported mimetypes.
 *
 * @param tikaParser parser to store and to query for its supported types
 */
public void setTikaParser(Parser tikaParser)
{
   this.tikaParser = tikaParser;

   // Convert every supported media type to its string form
   //  before handing the list to the parent.
   final ArrayList<String> supportedNames = new ArrayList<String>();
   for(final MediaType supportedType : tikaParser.getSupportedTypes(new ParseContext()))
   {
      final String typeName = supportedType.toString();
      supportedNames.add(typeName);
   }
   super.setSupportedMimetypes(supportedNames);
}
 
Example #27
Source File: MP3MetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Supplies a new {@code Mp3Parser} on every call.
 *
 * @return a freshly created MP3 parser
 */
@Override
protected Parser getParser()
{
   final Parser mp3Parser = new Mp3Parser();
   return mp3Parser;
}
 
Example #28
Source File: DocUtils.java    From geoportal-server-harvester with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the given file content with Tika and renders the extracted metadata
 * as an XML document, returned as UTF-8 bytes.
 * <p>
 * Description, modification date and title fall back to defaults when Tika does
 * not supply them: the stored property dump, a {@code TIKA_yyyy/MM/dd} label for
 * today, and the file name respectively.
 *
 * @param file_bytes content of the file to analyse
 * @param file_name  name used as the title when Tika reports none
 * @return the XML bytes, or {@code null} when parsing or XML generation fails
 *         (the failure is logged)
 * @throws IOException if closing the input stream fails
 */
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException {

	// Input & Output Variables
	ByteArrayInputStream base_input = new ByteArrayInputStream(file_bytes);
	byte[]               xml_bytes  = null;

	// Tika Parser Objects
	Parser               parser     = new AutoDetectParser();
	BodyContentHandler   handler    = new BodyContentHandler();
	Metadata             metadata   = new Metadata();
	ParseContext         context    = new ParseContext();

	try {
		// Populate Metadata Object with Tika Parser
		parser.parse(base_input, handler, metadata, context);

		// Container & Writer for Metadata
		Properties   meta_props = new Properties();
		StringWriter sw         = new StringWriter();

		// Put Tika Metadata in Properties.
		// Look each value up once and skip null as well as empty values, so a
		// name without a value cannot abort the whole conversion with an NPE.
		for (String name : metadata.names()) {
			String value = metadata.get(name);
			if (value != null && !value.isEmpty()) {
				meta_props.put(name, value);
			}
		}
		meta_props.store(sw, "Tika Values");

		// Expected Harvester Properties
		String meta_descr = metadata.get(TikaCoreProperties.DESCRIPTION);
		String meta_modif = metadata.get(TikaCoreProperties.MODIFIED);
		String meta_title = metadata.get(TikaCoreProperties.TITLE);

		// Default Label for Undefined Tika Properties
		DateFormat date_format = new SimpleDateFormat("yyyy/MM/dd");
		String     date_today  = date_format.format(new Date());
		String     tika_label  = String.format("TIKA_%s", date_today);

		// Check For Null Values & Set Defaults
		meta_props.put(WKAConstants.WKA_DESCRIPTION, meta_descr == null ? sw.toString() : meta_descr);
		meta_props.put(WKAConstants.WKA_MODIFIED, meta_modif == null ? tika_label : meta_modif);
		meta_props.put(WKAConstants.WKA_TITLE, meta_title == null ? file_name : meta_title);

		// Build XML as Bytes
		MapAttribute attr = AttributeUtils.fromProperties(meta_props);
		Document document = new SimpleDcMetaBuilder().create(attr);
		xml_bytes = XmlUtils.toString(document).getBytes("UTF-8");

	} catch (Exception ex) {
		// Best effort: the failure is logged and null is returned to the caller.
		LOG.error("Error reading data.", ex);
	} finally {
		base_input.close();
	}

	return xml_bytes;

}
 
Example #29
Source File: PdfBoxMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Supplies a new {@code PDFParser} on every call.
 *
 * @return a freshly created PDF parser
 */
@Override
protected Parser getParser()
{
   final Parser pdfParser = new PDFParser();
   return pdfParser;
}
 
Example #30
Source File: TikaAutoMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Returns the parser held in the {@code parser} field.
 * <p>
 * NOTE(review): the field is declared outside this view; presumably it is a
 * Tika auto-detecting parser that selects the best concrete parser — confirm.
 *
 * @return the value of the {@code parser} field
 */
@Override
protected Parser getParser() 
{
   return parser;
}