org.apache.tika.config.TikaConfig Java Examples

The following examples show how to use org.apache.tika.config.TikaConfig. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DirectoryManifest.java    From genie with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a visitor that remembers the supplied traversal settings and prepares
 * a fresh {@link Metadata} instance plus a default {@link TikaConfig}.
 *
 * @param root          stored in the {@code root} field
 * @param builder       stored in the {@code builder} field
 * @param checksumFiles stored in the {@code checksumFiles} field
 * @param filter        stored in the {@code filter} field
 * @throws IOException wrapping the {@link TikaException} raised when the default
 *                     Tika configuration cannot be created
 */
ManifestVisitor(
    final Path root,
    final ImmutableMap.Builder<String, ManifestEntry> builder,
    final boolean checksumFiles,
    final Filter filter
) throws IOException {
    // Caller-supplied settings
    this.filter = filter;
    this.checksumFiles = checksumFiles;
    this.builder = builder;
    this.root = root;

    // Tika helpers owned by this visitor
    this.metadata = new Metadata();
    try {
        this.tikaConfig = new TikaConfig();
    } catch (final TikaException tikaFailure) {
        // Log, then surface as the checked exception type this constructor declares
        log.error("Unable to create Tika Configuration due to error", tikaFailure);
        throw new IOException(tikaFailure);
    }
}
 
Example #2
Source File: ExtractingDocumentLoader.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a loader bound to a single request.
 *
 * @param req                request supplying the parameters and the core
 * @param processor          stored in the {@code processor} field
 * @param config             Tika configuration, also used to build the auto-detect parser
 * @param parseContextConfig stored in the {@code parseContextConfig} field
 * @param factory            stored in the {@code factory} field
 */
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
                                TikaConfig config, ParseContextConfig parseContextConfig,
                                SolrContentHandlerFactory factory) {
  // State derived from the request
  this.params = req.getParams();
  this.core = req.getCore();

  // Collaborators handed in by the caller
  this.processor = processor;
  this.parseContextConfig = parseContextConfig;
  this.config = config;
  this.factory = factory;

  // Template command, seeded from the request parameters
  templateAdd = new AddUpdateCommand(req);
  templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);
  templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true);

  // Lightweight to construct, per the original note
  autoDetectParser = new AutoDetectParser(config);

  ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
}
 
Example #3
Source File: TikaDocumentItemProcessor.java    From CogStack-Pipeline with Apache License 2.0 6 votes vote down vote up
/**
 * Prepares the Tika, Tesseract OCR and ImageMagick configuration and builds the
 * auto-detect parser.
 * <p>
 * The {@code tika-config.xml} classpath resource is read through a
 * try-with-resources block so the stream is always closed, and a missing
 * resource is reported explicitly instead of failing inside the XML parser.
 *
 * @throws IOException   if {@code tika-config.xml} is not on the classpath or cannot be read
 * @throws SAXException  if the configuration document cannot be parsed
 * @throws TikaException if Tika rejects the configuration
 */
@PostConstruct
public void init() throws IOException, SAXException, TikaException{
    setFieldName(tikaFieldName);

    // load tika configuration; the stream is closed as soon as it has been parsed
    try (java.io.InputStream configStream = this.getClass().getClassLoader()
                            .getResourceAsStream("tika-config.xml")) {
        if (configStream == null) {
            throw new IOException("tika-config.xml not found on the classpath");
        }
        tikaConfig = new TikaConfig(configStream);
    }

    // load tesseract ocr configuration; a non-positive timeout keeps the default
    tesseractConfig = new TesseractOCRConfig();
    if (tesseractTimeout > 0) {
        tesseractConfig.setTimeout(tesseractTimeout);
    }

    // load image magick configuration -- used for tiff conversion
    imgConfig = new ImageMagickConfig();
    if (convertTimeout > 0) {
        imgConfig.setTimeout(convertTimeout);
    }

    parser = new AutoDetectParser(tikaConfig);
}
 
Example #4
Source File: TikaAutoMetadataExtracterTest.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Creates and registers the extracter under test, then extends its mapping so
 * that the Tika content-type key maps to the test property.
 */
@Override
public void setUp() throws Exception
{
    super.setUp();

    extracter = new TikaAutoMetadataExtracter((TikaConfig)ctx.getBean("tikaConfig"));
    extracter.setDictionaryService(dictionaryService);
    extracter.register();

    // Start from a copy of the current mapping and add one extra entry,
    //  keyed by the Tika metadata content-type key; this is tested later
    HashMap<String, Set<QName>> extendedMapping =
          new HashMap<String, Set<QName>>(extracter.getMapping());

    Set<QName> contentTypeTargets = new HashSet<QName>();
    contentTypeTargets.add(TIKA_MIMETYPE_TEST_PROPERTY);
    extendedMapping.put(Metadata.CONTENT_TYPE, contentTypeTargets);

    extracter.setMapping(extendedMapping);
}
 
Example #5
Source File: TikaAutoMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Records the given config, creates an {@link AutoDetectParser} for it and
 * rebuilds {@code SUPPORTED_MIMETYPES} from the parser's media types.
 *
 * @param tikaConfig the Tika configuration to build the parser from
 * @return the rebuilt {@code SUPPORTED_MIMETYPES} list
 */
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig)
{
   config = tikaConfig;
   parser = new AutoDetectParser(config);

   SUPPORTED_MIMETYPES = new ArrayList<String>();
   for(MediaType canonical : parser.getParsers().keySet())
   {
      // The canonical form goes in first ...
      String canonicalName = canonical.toString();
      SUPPORTED_MIMETYPES.add( canonicalName );

      // ... followed by every alias, because Alfresco uses some
      //  non canonical forms of various mimetypes
      for(MediaType aliasType : config.getMediaTypeRegistry().getAliases(canonical))
      {
          String aliasName = aliasType.toString();
          SUPPORTED_MIMETYPES.add( aliasName );
      }
   }
   return SUPPORTED_MIMETYPES;
}
 
Example #6
Source File: TikaIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the Tika configuration when a config path is set on the spec.
 * <p>
 * The stream opened on the config resource is closed once the configuration
 * has been parsed. When no path is set, {@code tikaConfig} is left untouched.
 *
 * @throws Exception if the resource cannot be matched, opened or parsed
 */
@Setup
public void setup() throws Exception {
  if (spec.getTikaConfigPath() != null) {
    ResourceId configResource =
        FileSystems.matchSingleFileSpec(spec.getTikaConfigPath().get()).resourceId();
    // try-with-resources closes the stream (and the channel it wraps)
    try (java.io.InputStream configStream =
        Channels.newInputStream(FileSystems.open(configResource))) {
      tikaConfig = new TikaConfig(configStream);
    }
  }
}
 
Example #7
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
/**
 * Loads {@code tika-config.xml} from the classpath into {@code config}.
 * <p>
 * The resource stream is closed once parsing finishes. Failures are logged at
 * SEVERE level and not rethrown, in which case {@code config} is not assigned.
 */
@Before
public void initConfig() {
    // try-with-resources skips close() for a null resource, so a missing file
    // still reaches the TikaConfig constructor exactly as before
    try (InputStream is = getClass().getClassLoader().getResourceAsStream("tika-config.xml")) {
        config = new TikaConfig(is);
    } catch (TikaException | IOException | SAXException ex) {
        Logger.getLogger(PDFPreprocessorParserTest.class.getName()).log(Level.SEVERE, null, ex);
    }
}
 
Example #8
Source File: DefaultMimeSupport.java    From nexus-public with Eclipse Public License 1.0 5 votes vote down vote up
/**
 * Sets up the default Tika config and detector, then builds a bounded cache
 * (at most 500 entries) that maps a file extension to its mime types.
 *
 * @param nexusMimeTypes source of the Nexus-specific mime rules consulted before Tika
 */
@VisibleForTesting
public DefaultMimeSupport(final NexusMimeTypes nexusMimeTypes) {
  this.tikaConfig = TikaConfig.getDefaultConfig();
  this.detector = tikaConfig.getDetector();

  // loader computing the mime types for one extension on a cache miss
  final CacheLoader<String, List<String>> extensionLoader = new CacheLoader<String, List<String>>()
  {
    @Override
    public List<String> load(final String extension)
        throws Exception
    {
      final List<String> mimeTypes = Lists.newArrayList();

      // Nexus matches go first
      final MimeRule rule = nexusMimeTypes.getMimeRuleForExtension(extension);
      if (rule != null) {
        mimeTypes.addAll(rule.getMimetypes());
      }

      // an overriding Nexus rule means Tika is not consulted at all
      final boolean nexusOverrides = rule != null && rule.isOverride();
      if (!nexusOverrides) {
        // ask Tika too, using only a synthetic resource name (no content stream)
        final Metadata nameOnly = new Metadata();
        nameOnly.set(Metadata.RESOURCE_NAME_KEY, "dummy." + extension);
        final MediaType tikaType = detector.detect(null, nameOnly);
        // unravel to least specific
        unravel(mimeTypes, tikaType);
      }
      return mimeTypes;
    }
  };

  // create the cache
  extensionToMimeTypeCache = CacheBuilder.newBuilder().maximumSize(500).build(extensionLoader);
}
 
Example #9
Source File: UploadAuditSetUpFormValidator.java    From Asqatasun with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Control whether the uploaded files are of HTML type and whether their
 * size is under the maxFileSize limit.
 * <p>
 * Each non-empty file has its mime type detected from its content and is
 * rejected when that type is not in {@code authorizedMimeType}. The upload
 * stream is closed after detection. When no file has any content, a general
 * "no file uploaded" error is recorded.
 *
 * @param uploadAuditSetUpCommand command holding the uploaded files
 * @param errors collector receiving the rejections
 */
private void validateFiles(AuditSetUpCommand uploadAuditSetUpCommand, Errors errors) {
    boolean emptyFile = true;
    Metadata metadata = new Metadata();
    MimeTypes mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
    // fetch the array once rather than on every loop iteration
    CommonsMultipartFile[] files = uploadAuditSetUpCommand.getFileInputList();

    for (int i = 0; i < files.length; i++) {
        String fieldName = ID_INPUT_FILE_PREFIX + "[" + i + "]";
        try {
            CommonsMultipartFile cmf = files[i];
            if (cmf.getSize() > maxFileSize) {
                Long maxFileSizeInMega = maxFileSize / 1000000;
                String[] arg = {maxFileSizeInMega.toString()};
                errors.rejectValue(fieldName, FILE_SIZE_EXCEEDED_MSG_BUNDLE_KEY, arg, "{0}");
            }
            if (cmf.getSize() > 0) {
                emptyFile = false;
                String mime;
                // close the upload stream once detection is done
                try (BufferedInputStream content = new BufferedInputStream(cmf.getInputStream())) {
                    mime = mimeTypes.detect(content, metadata).toString();
                }
                LOGGER.debug("mime  " + mime + "  " +cmf.getOriginalFilename());
                if (!authorizedMimeType.contains(mime)) {
                    errors.rejectValue(fieldName, NOT_HTML_MSG_BUNDLE_KEY);
                }
            }
        } catch (IOException ex) {
            // an unreadable upload is reported the same way as a non-HTML one
            LOGGER.warn(ex);
            errors.rejectValue(fieldName, NOT_HTML_MSG_BUNDLE_KEY);
        }
    }
    if(emptyFile) { // if no file is uploaded
        LOGGER.debug("emptyFiles");
        errors.rejectValue(GENERAL_ERROR_MSG_KEY,
                NO_FILE_UPLOADED_MSG_BUNDLE_KEY);
    }
}
 
Example #10
Source File: TikaAutoContentTransformerTest.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Creates the transformer under test from the {@code tikaConfig} bean, wires
 * in its collaborators and completes its initialisation.
 */
@Override
public void setUp() throws Exception
{
    super.setUp();

    // Build the transformer straight from the context-provided Tika config
    transformer = new TikaAutoContentTransformer( (TikaConfig)ctx.getBean("tikaConfig") );

    // Wire in the collaborators before finishing initialisation
    transformer.setMimetypeService(mimetypeService);
    transformer.setTransformerDebug(transformerDebug);
    transformer.setTransformerConfig(transformerConfig);

    transformer.afterPropertiesSet();
}
 
Example #11
Source File: ArchiveContentTransformer.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Extends the parent's parse context with the {@link Parser} to use for
 * embedded content: an {@link AutoDetectParser} when recursing into the
 * contents, or an {@link EmptyParser} when not.
 */
@Override
protected ParseContext buildParseContext(Metadata metadata,
     String targetMimeType, TransformationOptions options) {
  ParseContext context = super.buildParseContext(metadata, targetMimeType, options);

  // A per-call option, when present, takes precedence over the configured default
  boolean recurse = (options.getIncludeEmbedded() != null)
        ? options.getIncludeEmbedded()
        : includeContents;

  if(!recurse)
  {
      // REPO-1066: an AutoDetectParser is the default in Tika after: https://issues.apache.org/jira/browse/TIKA-2096
      // so we need to specify an empty one if we don't want the recurse parsing to happen
      context.set(Parser.class, new EmptyParser());
      return context;
  }

  // Use an auto detect parser to handle the contents,
  //  falling back to the default Tika config if none was injected
  if(tikaConfig == null)
  {
      tikaConfig = TikaConfig.getDefaultConfig();
  }
  context.set(Parser.class, new AutoDetectParser(tikaConfig));
  return context;
}
 
Example #12
Source File: TikaPoweredContainerExtractor.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Stores the supplied TikaConfig and rebuilds the detector and parser
 *  that are derived from it.
 *
 * @param tikaConfig The Tika Config to use
 */
public void setTikaConfig(TikaConfig tikaConfig)
{
    this.config = tikaConfig;

    // The detector works off the config's mime repository,
    //  and the parser in turn is driven by that detector
    this.detector = new DefaultDetector(tikaConfig.getMimeRepository());
    this.parser = new AutoDetectParser(this.detector);
}
 
Example #13
Source File: AddScenarioFormValidator.java    From Asqatasun with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Checks that a scenario file was uploaded, that it does not exceed
 * {@code maxFileSize}, that it is not empty and that its detected mime type
 * is one of {@code authorizedMimeType}. Every failure is recorded on
 * {@code errors}. The upload stream used for detection is closed afterwards.
 *
 * @param addScenarioCommand command holding the uploaded scenario file
 * @param errors collector receiving the rejections
 * @return  whether the scenario handled by the current AddScenarioCommand
 * has a correct type and size
 */
public boolean checkScenarioFileTypeAndSize(
        AddScenarioCommand addScenarioCommand, 
        Errors errors) {
    if (addScenarioCommand.getScenarioFile() == null) { // if no file uploaded
        LOGGER.debug("empty Scenario File");
        errors.rejectValue(GENERAL_ERROR_MSG_KEY,
                MANDATORY_FIELD_MSG_BUNDLE_KEY);
        errors.rejectValue(SCENARIO_FILE_KEY,
                NO_SCENARIO_UPLOADED_MSG_BUNDLE_KEY);
        return false;
    }
    Metadata metadata = new Metadata();
    MimeTypes mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
    try {
        CommonsMultipartFile cmf = addScenarioCommand.getScenarioFile();
        if (cmf.getSize() > maxFileSize) {
            Long maxFileSizeInMega = maxFileSize / 1000000;
            String[] arg = {maxFileSizeInMega.toString()};
            errors.rejectValue(GENERAL_ERROR_MSG_KEY,
                    MANDATORY_FIELD_MSG_BUNDLE_KEY);
            errors.rejectValue(SCENARIO_FILE_KEY, FILE_SIZE_EXCEEDED_MSG_BUNDLE_KEY, arg, "{0}");
            return false;
        } else if (cmf.getSize() > 0) {
            String mime;
            // close the upload stream once detection is done
            try (BufferedInputStream content = new BufferedInputStream(cmf.getInputStream())) {
                mime = mimeTypes.detect(content, metadata).toString();
            }
            LOGGER.debug("mime  " + mime + "  " + cmf.getOriginalFilename());
            if (!authorizedMimeType.contains(mime)) {
                errors.rejectValue(GENERAL_ERROR_MSG_KEY,
                    MANDATORY_FIELD_MSG_BUNDLE_KEY);
                errors.rejectValue(SCENARIO_FILE_KEY, NOT_SCENARIO_MSG_BUNDLE_KEY);
                return false;
            }
        } else {
            // a zero-length upload is treated like a missing one
            LOGGER.debug("File with size null");
            errors.rejectValue(GENERAL_ERROR_MSG_KEY,
                MANDATORY_FIELD_MSG_BUNDLE_KEY);
            errors.rejectValue(SCENARIO_FILE_KEY,
                NO_SCENARIO_UPLOADED_MSG_BUNDLE_KEY);
            return false;
        }
    } catch (IOException ex) {
        // an unreadable upload is reported as "not a scenario"
        LOGGER.warn(ex);
        errors.rejectValue(SCENARIO_FILE_KEY, NOT_SCENARIO_MSG_BUNDLE_KEY);
        errors.rejectValue(GENERAL_ERROR_MSG_KEY,
                MANDATORY_FIELD_MSG_BUNDLE_KEY);
        return false;
    }
    return true;
}
 
Example #14
Source File: TikaDetector.java    From spring-boot-email-tools with Apache License 2.0 4 votes vote down vote up
/**
 * Private constructor: obtains the detector from Tika's default configuration.
 */
private TikaDetector() {
    detector = TikaConfig.getDefaultConfig().getDetector();
}
 
Example #15
Source File: IdentifyMimeType.java    From nifi with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the processor backed by Tika's default configuration.
 */
public IdentifyMimeType() {
    // Use the default Tika configuration
    this.config = TikaConfig.getDefaultConfig();
}
 
Example #16
Source File: DirectoryScanner.java    From importer-exporter with Apache License 2.0 4 votes vote down vote up
public DirectoryScanner() throws TikaException, IOException {
    tikaConfig = new TikaConfig();
    contentFile = Pattern.compile("(?i).+\\.((gml)|(xml)|(json)|(gz)|(gzip))$");
    matcher = Pattern.compile("").matcher("");
}
 
Example #17
Source File: TikaIntegrationTest.java    From wildfly-camel with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a bean repository whose JNDI context binds {@code testConfig} to a
 * TikaConfig loaded from the empty test configuration file.
 */
private static BeanRepository createRegistryWithEmptyConfig() throws Exception {
    Context jndiContext = createJndiContext();

    File emptyConfigFile = new File("src/test/resources/tika/tika-empty.xml");
    jndiContext.bind("testConfig", new TikaConfig(emptyConfigFile));

    return new JndiBeanRepository(jndiContext);
}
 
Example #18
Source File: MimetypeMap.java    From alfresco-data-model with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Initialises the map using the configuration service provided.
 * <p>
 * Steps, in order: checks the mandatory properties; when any JSON-related
 * property is set, creates the {@code jsonConfigFileFinder} that turns JSON
 * media type definitions into config elements and registers them; falls back
 * to the default TikaConfig when none was supplied; creates the Tika mimetype
 * detector; and finally runs the config scheduler.
 */
public void init()
{
    PropertyCheck.mandatory(this, "configService", configService);
    PropertyCheck.mandatory(this, "contentCharsetFinder", contentCharsetFinder);

    // Do we have any properties that indicate we will read JSON?
    if (mimetypeJsonConfigDir != null || jsonObjectMapper != null || cronExpression != null || initialAndOnErrorCronExpression != null)
    {
        // Any one of the JSON-related properties makes the object mapper mandatory
        PropertyCheck.mandatory(this, "jsonObjectMapper", jsonObjectMapper);
        // If we have a cronExpression it indicates that we will schedule reading.
        if (cronExpression != null)
        {
            PropertyCheck.mandatory(this, "initialAndOnErrorCronExpression", initialAndOnErrorCronExpression);
        }
        jsonConfigFileFinder = new ConfigFileFinder(jsonObjectMapper)
        {
            // Converts the "mediaTypes" array of one JSON document into mimetype
            // config elements and registers them; documents without such an
            // array are ignored.
            @Override
            protected void readJson(JsonNode jsonNode, String readFromMessage, String baseUrl) throws IOException
            {
                try
                {
                    JsonNode mediaTypes = jsonNode.get("mediaTypes");
                    if (mediaTypes != null && mediaTypes.isArray())
                    {
                        List<ConfigElement> mimetypes = new ArrayList<>();
                        for (JsonNode mediaType : mediaTypes)
                        {
                            // One config element per media type definition
                            MediaTypeDef def = jsonObjectMapper.convertValue(mediaType, MediaTypeDef.class);
                            GenericConfigElement mimetype = new GenericConfigElement(ATTR_MIMETYPE);
                            mimetype.addAttribute(ATTR_DISPLAY, def.name);
                            mimetype.addAttribute(ATTR_MIMETYPE, def.mediaType);
                            if (def.text)
                            {
                                mimetype.addAttribute(ATTR_TEXT, Boolean.TRUE.toString());
                            }

                            // One child element per extension; 'ext' ends up holding
                            // the last extension element created, 'count' how many
                            GenericConfigElement ext = null;
                            int count = 0;
                            for (ExtensionDef extension : def.extensions)
                            {
                                ext = new GenericConfigElement(ATTR_EXTENSION);
                                ext.setValue(extension.extension);
                                if (extension.name != null && !extension.name.isBlank())
                                {
                                    ext.addAttribute(ATTR_DISPLAY, extension.name);
                                }
                                if (extension.isDefault)
                                {
                                    ext.addAttribute(ATTR_DEFAULT, Boolean.TRUE.toString());
                                }
                                mimetype.addChild(ext);
                                count++;
                            }
                            // A sole extension is marked as the default when it was
                            // not already flagged as such
                            if (count == 1 && ext.getAttribute(ATTR_DEFAULT) == null)
                            {
                                ext.addAttribute(ATTR_DEFAULT, Boolean.TRUE.toString());
                            }
                            mimetypes.add(mimetype);
                        }
                        registerMimetypes(mimetypes);
                        // Count this document as a file that was read
                        Data data = getData();
                        data.fileCount++;
                    }
                }
                catch (IllegalArgumentException e)
                {
                    // Logged and not rethrown, so nothing from this document is registered.
                    // NOTE(review): presumably raised by convertValue on malformed
                    // definitions - confirm
                    logger.error("Error reading "+readFromMessage+" "+e.getMessage());
                }
            }
        };
    }

    // TikaConfig should be given, but work around it if not
    if (tikaConfig == null)
    {
        logger.warn("TikaConfig spring parameter not supplied, using default config");
        setTikaConfig(TikaConfig.getDefaultConfig());
    }
    // Create our Tika mimetype detector up-front
    // We can then be sure we only have the one, so it's quick (ALF-10813)
    detector = new DefaultDetector(tikaConfig.getMimeRepository());

    // Work out the mappings - only runs once and straight away if cronExpression is null
    configScheduler.run(true, logger, cronExpression, initialAndOnErrorCronExpression);
}
 
Example #19
Source File: IdentifyMimeType.java    From localization_nifi with Apache License 2.0 4 votes vote down vote up
public IdentifyMimeType() {
    // Setup Tika
    this.config = TikaConfig.getDefaultConfig();
    this.detector = config.getDetector();
}
 
Example #20
Source File: TikaAutoMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Creates the extracter, handing the superclass the mimetype list that
 *  {@code buildMimeTypes} produces for the given config.
 *
 * @param tikaConfig The Tika Config to build the mimetype list from
 */
public TikaAutoMetadataExtracter(final TikaConfig tikaConfig)
{
   super(buildMimeTypes(tikaConfig));
}
 
Example #21
Source File: TikaAudioMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Stores the TikaConfig in the {@code tikaConfig} field.
 *
 * @param tikaConfig The Tika Config to store
 */
public void setTikaConfig(final TikaConfig tikaConfig)
{
   this.tikaConfig = tikaConfig;
}
 
Example #22
Source File: TikaAutoContentTransformer.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Creates the transformer, handing the superclass the mimetype list that
 *  {@code buildMimeTypes} produces for the given config, then enables the
 *  timeout thread and names the transformer "TikaAuto".
 *
 * @param tikaConfig The Tika Config to build the mimetype list from
 */
public TikaAutoContentTransformer(final TikaConfig tikaConfig)
{
   super(buildMimeTypes(tikaConfig));

   setUseTimeoutThread(true);
   setTransformerName("TikaAuto");
}
 
Example #23
Source File: TikaAutoContentTransformer.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Records the given config, creates an {@link AutoDetectParser} for it and
 * rebuilds {@code SUPPORTED_MIMETYPES} from the parser's media types and
 * their aliases, leaving out the types this transformer does not handle.
 *
 * @param tikaConfig the Tika configuration to build the parser from
 * @return the rebuilt {@code SUPPORTED_MIMETYPES} list
 */
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig)
{
   config = tikaConfig;
   parser = new AutoDetectParser(config);

   SUPPORTED_MIMETYPES = new ArrayList<String>();
   for(MediaType canonical : parser.getParsers().keySet())
   {
      // Consider the canonical type first, then each of its aliases;
      // Alfresco sometimes uses the canonical type, and sometimes an alias
      ArrayList<MediaType> candidates = new ArrayList<MediaType>();
      candidates.add(canonical);
      candidates.addAll( config.getMediaTypeRegistry().getAliases(canonical) );

      for(MediaType candidate : candidates)
      {
          String fullName = candidate.toString();
          String topLevelType = candidate.getType();

          // TODO Tika support for quick.odf, mimetype=application/vnd.oasis.opendocument.formula
          // TODO Tika support for quick.otf, mimetype=application/vnd.oasis.opendocument.formula-template
          if(fullName.startsWith("application/vnd.oasis.opendocument.formula"))
          {
              continue;
          }

          // TODO Tika support for quick.odg, mimetype=application/vnd.oasis.opendocument.graphics
          // TODO Tika support for quick.otg, mimetype=application/vnd.oasis.opendocument.graphics-template
          if(fullName.startsWith("application/vnd.oasis.opendocument.graphics"))
          {
              continue;
          }

          // Skip media types, as Tika mostly just does
          //  metadata rather than content
          if(topLevelType.equals("image") ||
                  topLevelType.equals("audio") ||
                  topLevelType.equals("video"))
          {
              continue;
          }

          // Skip container formats, as we handle them in a different
          //  transformer to give the user control over recursion
          if(fullName.equals("application/zip") ||
                  fullName.equals("application/tar") ||
                  fullName.equals("application/x-tar"))
          {
              continue;
          }

          // Skip mail formats, as we want our textual representations to include
          //  parts of the metadata (eg people, subjects, dates) too
          if(fullName.equals("message/rfc822") ||
                  fullName.equals("application/vnd.ms-outlook"))
          {
              continue;
          }

          // Tika can probably do some useful text
          SUPPORTED_MIMETYPES.add( fullName );
      }
   }
   return SUPPORTED_MIMETYPES;
}
 
Example #24
Source File: MimetypeMap.java    From alfresco-data-model with GNU Lesser General Public License v3.0 2 votes vote down vote up
/**
 * Stores the TikaConfig in the {@code tikaConfig} field.
 *
 * @param tikaConfig The Tika Config to store
 */
public void setTikaConfig(final TikaConfig tikaConfig)
{
    this.tikaConfig = tikaConfig;
}
 
Example #25
Source File: TikaProcessor.java    From jesterj with Apache License 2.0 2 votes vote down vote up
/**
 * Builds a {@link TikaConfig} from an already-loaded XML document and sets it
 * on the object under construction. How the document was obtained
 * (filesystem, classpath or otherwise) is up to the caller.
 *
 * @param config The configuration document
 * @return This builder for further config
 * @throws TikaException if Tika doesn't like your config
 * @throws IOException if Tika can't find something it needed?
 */
public Builder configuredWith(org.w3c.dom.Document config) throws TikaException, IOException {
  final TikaConfig parsedConfig = new TikaConfig(config);
  getObj().tikaConfig = parsedConfig;
  return this;
}
 
Example #26
Source File: ArchiveContentTransformer.java    From alfresco-repository with GNU Lesser General Public License v3.0 2 votes vote down vote up
/**
 * Stores the TikaConfig in the {@code tikaConfig} field.
 *
 * @param tikaConfig The Tika Config to store
 */
public void setTikaConfig(final TikaConfig tikaConfig)
{
    this.tikaConfig = tikaConfig;
}
 
Example #27
Source File: HTMLRenderingEngine.java    From alfresco-repository with GNU Lesser General Public License v3.0 2 votes vote down vote up
/**
 * Stores the TikaConfig in the {@code tikaConfig} field.
 *
 * @param tikaConfig The Tika Config to store
 */
public void setTikaConfig(final TikaConfig tikaConfig)
{
    this.tikaConfig = tikaConfig;
}