org.apache.tika.Tika Java Examples

The following examples show how to use org.apache.tika.Tika. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HttpClient.java    From AuTe-Framework with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a browser-compatible multipart body from the given form fields.
 *
 * <p>Fields whose type is {@code null} or {@code FieldType.TEXT} are added as UTF-8
 * {@code text/plain} parts after saved scenario values are substituted into the value.
 * Every other field is treated as a file: the file is resolved relative to
 * {@code projectPath} and sent as a binary part. The part's content type is the
 * field's own MIME type when it is set, otherwise the type detected by Tika.
 *
 * @param formDataList      form fields to add, in order
 * @param projectPath       prefix prepended to each file path; {@code null} is treated as empty
 * @param scenarioVariables values substituted into text field values
 * @return the populated builder
 * @throws IOException if Tika cannot read a file while detecting its type
 */
private MultipartEntityBuilder setEntity(List<FormData> formDataList, String projectPath, Map<String, Object> scenarioVariables) throws IOException {
    MultipartEntityBuilder entity = MultipartEntityBuilder.create().setMode(HttpMultipartMode.BROWSER_COMPATIBLE);
    // Created on first use and shared by all file fields: building a Tika facade
    // per field is needless work, and text-only forms never need one.
    Tika tika = null;
    for (FormData formData : formDataList) {
        if (formData.getFieldType() == null || FieldType.TEXT.equals(formData.getFieldType())) {
            entity.addTextBody(formData.getFieldName(), ExecutorUtils.insertSavedValues(formData.getValue(), scenarioVariables), ContentType.create("text/plain", StandardCharsets.UTF_8));
        } else {
            log.debug("Try to identify Mime type projectPath = {}, formData = {}, fromData.getFilePath = {}", projectPath, formData, formData.getFilePath());
            File file = new File((projectPath == null ? "" : projectPath) + formData.getFilePath());
            if (tika == null) {
                tika = new Tika();
            }
            String detectedMimeType = tika.detect(file);
            log.debug("Tika detection result = {}", detectedMimeType);
            log.debug("Try to get content type from formData.getMimeType = {}, tika detected mime type = {}", formData.getMimeType(), detectedMimeType);
            entity.addBinaryBody(
                    formData.getFieldName(),
                    file,
                    // An explicitly configured MIME type wins over the detected one.
                    ContentType.parse(StringUtils.isEmpty(formData.getMimeType()) ? detectedMimeType : formData.getMimeType()),
                    file.getName()
            );
        }
    }
    return entity;
}
 
Example #2
Source File: Base64Utils.java    From NutzSite with Apache License 2.0 6 votes vote down vote up
/**
     * Encodes a file as a {@code data:} URI: {@code data:<content type>;base64,<payload>}.
     * The content type is detected from the file by Tika.
     *
     * @param file the file to encode
     * @return the data URI, or {@code null} if the file could not be read
     */
    public static String fileBase64(File file) {
        try {
            // Detect the content type of the file.
            Tika tika = new Tika();
            String contentType = tika.detect(file);
            // Read the whole file into memory and base64-encode it.
            byte[] data = Files.readAllBytes(file.toPath());
            String base64str = DatatypeConverter.printBase64Binary(data);
            // Assemble the data URI. The result is deliberately not echoed to
            // stdout: it contains the entire file content.
            StringBuilder sb = new StringBuilder();
            sb.append("data:");
            sb.append(contentType);
            sb.append(";base64,");
            sb.append(base64str);
            return sb.toString();

        } catch (IOException e) {
            // Best effort: report the failure and fall through to the null result.
            e.printStackTrace();
        }
        return null;
    }
 
Example #3
Source File: ContentExtractor.java    From FXDesktopSearch with Apache License 2.0 6 votes vote down vote up
/**
 * Creates an extractor for the given configuration.
 *
 * <p>Stores the configuration, compiles the metadata date pattern, creates a Tika
 * facade with a maximum string length of 5 * 1024 * 1024, and loads the models of
 * an {@code OptimaizeLangDetector} that becomes the language detector.
 *
 * @param aConfiguration configuration kept by this extractor
 * @throws RuntimeException wrapping any exception raised while loading the language models
 */
public ContentExtractor(final Configuration aConfiguration) {
        configuration = aConfiguration;

        // TODO: also handle this pattern correctly: Mon Feb 18 15:55:10 CET 2013
        metaDataDatePattern = Pattern.compile("(\\d{4})-(\\d{2})-(\\d{2})T(\\d{2}):(\\d{2}):(\\d{2})Z");

        tika = new Tika();
        tika.setMaxStringLength(5 * 1024 * 1024);

        final var optimaizeDetector = new OptimaizeLangDetector();
        try {
            optimaizeDetector.loadModels();
        } catch (final Exception e) {
            throw new RuntimeException(e);
        }
        // Only reached when the models loaded successfully.
        languageDetector = optimaizeDetector;
    }
 
Example #4
Source File: ActionToHtml.java    From o2oa with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Detects the media type of the uploaded bytes with Tika and dispatches to the
 * matching handler: {@code doc(bytes)} for {@code application/msword} and
 * {@code docx(bytes)} for the OOXML word-processing type. The handler's result is
 * stored as the value of the returned {@code Wo}.
 *
 * @param effectivePerson not read by this method
 * @param bytes           raw content of the uploaded document
 * @param disposition     not read by this method
 * @return a result whose data carries the handler output
 * @throws ExceptionUnsupportType if the detected type is neither of the two supported ones
 * @throws Exception              propagated from the handlers
 */
ActionResult<Wo> execute(EffectivePerson effectivePerson, byte[] bytes, FormDataContentDisposition disposition)
		throws Exception {
	ActionResult<Wo> result = new ActionResult<>();
	String type = new Tika().detect(bytes);
	Wo wo = new Wo();
	if (type.equals("application/msword")) {
		wo.setValue(this.doc(bytes));
	} else if (type.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
		wo.setValue(this.docx(bytes));
	} else {
		throw new ExceptionUnsupportType(type);
	}
	result.setData(wo);
	return result;
}
 
Example #5
Source File: ResourceServiceImpl.java    From jwala with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the service from its collaborators. Every argument is stored as-is in the
 * field of the same name; no validation or other work is done here.
 *
 * @param resourcePersistenceService      stored in {@code resourcePersistenceService}
 * @param groupPersistenceService         stored in {@code groupPersistenceService}
 * @param applicationPersistenceService   stored in {@code applicationPersistenceService}
 * @param jvmPersistenceService           stored in {@code jvmPersistenceService}
 * @param webServerPersistenceService     stored in {@code webServerPersistenceService}
 * @param resourceDao                     stored in {@code resourceDao}
 * @param resourceHandler                 stored in {@code resourceHandler}
 * @param resourceContentGeneratorService stored in {@code resourceContentGeneratorService}
 * @param binaryDistributionService       stored in {@code binaryDistributionService}
 * @param fileTypeDetector                Tika facade stored in {@code fileTypeDetector}
 * @param repositoryService               stored in {@code repositoryService}
 */
public ResourceServiceImpl(final ResourcePersistenceService resourcePersistenceService,
                           final GroupPersistenceService groupPersistenceService,
                           final ApplicationPersistenceService applicationPersistenceService,
                           final JvmPersistenceService jvmPersistenceService,
                           final WebServerPersistenceService webServerPersistenceService,
                           final ResourceDao resourceDao,
                           final ResourceHandler resourceHandler,
                           final ResourceContentGeneratorService resourceContentGeneratorService,
                           final BinaryDistributionService binaryDistributionService,
                           final Tika fileTypeDetector,
                           final RepositoryService repositoryService) {
    // Persistence layer.
    this.webServerPersistenceService = webServerPersistenceService;
    this.jvmPersistenceService = jvmPersistenceService;
    this.applicationPersistenceService = applicationPersistenceService;
    this.groupPersistenceService = groupPersistenceService;
    this.resourcePersistenceService = resourcePersistenceService;
    this.resourceDao = resourceDao;

    // Resource handling and generation.
    this.resourceHandler = resourceHandler;
    this.resourceContentGeneratorService = resourceContentGeneratorService;

    // Supporting services.
    this.repositoryService = repositoryService;
    this.fileTypeDetector = fileTypeDetector;
    this.binaryDistributionService = binaryDistributionService;
}
 
Example #6
Source File: TikaProcessor.java    From jesterj with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the document's raw data through Tika and copies the results onto the document.
 *
 * <p>A document with no raw data is returned untouched. Otherwise the extracted text
 * replaces the raw data (when {@code replaceRaw} is set) and/or is stored under
 * {@code destField} (when non-null), and every Tika metadata entry is stored under
 * its sanitized name plus the configured suffix. If Tika fails with an
 * {@code IOException} or {@code TikaException} the document status is set to
 * {@code Status.ERROR}. {@code AccessControlException} and {@code SecurityException}
 * are swallowed; any other throwable is rethrown.
 *
 * @param document the document to process
 * @return a single-element array holding the same document instance
 */
@Override
public Document[] processDocument(Document document) {
  try {
    final byte[] payload = document.getRawData();
    if (payload == null) {
      log.debug("Skipping document without data in " + getName());
      return new Document[]{document};
    }

    final Tika extractor = new Tika(tikaConfig);
    extractor.setMaxStringLength(document.getRawData().length);
    final Metadata extracted = new Metadata();

    try (ByteArrayInputStream in = new ByteArrayInputStream(payload)) {
      final String text = extractor.parseToString(in, extracted, maxLength);
      if (replaceRaw) {
        document.setRawData(text.getBytes(StandardCharsets.UTF_8));
      }
      if (destField != null) {
        document.put(destField, text);
      }
      for (String key : extracted.names()) {
        document.put(sanitize(key) + plusSuffix(), extracted.get(key));
      }
    } catch (IOException | TikaException e) {
      log.debug("Tika processing failure!", e);
      // An unparseable document must not leak raw binary content into the index.
      document.setStatus(Status.ERROR);
    }
  } catch (Throwable t) {
    if (t instanceof AccessControlException || t instanceof SecurityException) {
      // Security-related failures are deliberately ignored; the document is
      // returned in whatever state it reached.
    } else {
      throw t;
    }
  }
  return new Document[]{document};
}
 
Example #7
Source File: DefaultMessagingConfigurer.java    From ogham with Apache License 2.0 5 votes vote down vote up
/**
 * Configures MIME type detection for image inlining: registers a fresh Tika
 * instance, sets the fail-if-octet-stream default to {@code true}, and takes the
 * allowed MIME types from {@code ogham.email.image-inlining.mimetype.allowed-mimetypes}
 * with {@code image/*} as the default.
 *
 * @param builder the detection builder to configure
 */
protected void configureImageInliningMimetype(MimetypeDetectionBuilder<?> builder) {
	final Tika detector = new Tika();
	final String[] allowedByDefault = new String[] { "image/*" };
	// @formatter:off
	builder
		.tika()
			.instance(detector)
			.failIfOctetStream().defaultValue(overrideIfNotSet(true)).and()
			.and()
		.allowed().properties("${ogham.email.image-inlining.mimetype.allowed-mimetypes}").defaultValue(overrideIfNotSet(allowedByDefault));
	// @formatter:on
}
 
Example #8
Source File: DefaultMessagingConfigurer.java    From ogham with Apache License 2.0 5 votes vote down vote up
/**
 * Configures general MIME type detection: registers a fresh Tika instance, binds
 * fail-if-octet-stream to {@code ogham.mimetype.tika.fail-if-octet-stream}
 * (default {@code true}), and binds the default MIME type to
 * {@code ogham.mimetype.default-mimetype} (default {@code application/octet-stream}).
 *
 * @param builder the detection builder to configure
 */
@Override
public void configure(MimetypeDetectionBuilder<?> builder) {
	final Tika detector = new Tika();
	// @formatter:off
	builder
		.tika()
			.instance(detector)
			.failIfOctetStream().properties("${ogham.mimetype.tika.fail-if-octet-stream}").defaultValue(overrideIfNotSet(true)).and()
			.and()
		.defaultMimetype().properties("${ogham.mimetype.default-mimetype}").defaultValue(overrideIfNotSet("application/octet-stream"));
	// @formatter:on
}
 
Example #9
Source File: ImageViewPanel.java    From Orienteer with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the panel and adds its {@code "image"} child. When the model object holds
 * bytes, their MIME type is detected with Tika and they are served through a
 * {@code ByteArrayResource}; when it is {@code null} an {@code EmptyPanel} is added.
 *
 * @param id         component id
 * @param valueModel model whose object is cast to {@code byte[]}
 */
public ImageViewPanel(String id, IModel<V> valueModel) {
	super(id, valueModel);

	final byte[] content = (byte[]) getModelObject();
	if (content == null) {
		add(new EmptyPanel("image"));
		return;
	}

	final String detectedType = new Tika().detect(content);
	add(new Image("image", new ByteArrayResource(detectedType, content)));
}
 
Example #10
Source File: ImageEditPanel.java    From Orienteer with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the inherited validation, then rejects an upload whose Tika-detected MIME
 * type does not start with {@code image/}. Nothing further is checked when no file
 * was uploaded.
 */
@Override
public void validate() {
    super.validate();

    final FileUpload upload = fileUploadField.getFileUpload();
    if (upload == null) {
        return;
    }

    final String detectedType = new Tika().detect(upload.getBytes());
    if (!detectedType.startsWith("image/")) {
        error(getString("errors.wrong.image.uploaded"));
    }
}
 
Example #11
Source File: DataURI.java    From osiam with MIT License 5 votes vote down vote up
/**
 * @param inputStream a inputStream which will be transformed into an DataURI
 * @throws IOException                 if the stream can not be read or is closed
 * @throws SCIMDataValidationException if the inputStream can't be converted into an DataURI
 */
public DataURI(InputStream inputStream) throws IOException {
    if (inputStream == null) {
        throw new SCIMDataValidationException("The given inputStream can't be null.");
    }
    String mimeType = new Tika().detect(inputStream);
    dataUri = convertInputStreamToDataURI(inputStream, mimeType);
}
 
Example #12
Source File: TikaLambdaHandler.java    From tika-lambda with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the object data with an {@code AutoDetectParser} and assembles the result.
 *
 * <p>SAX events from the parser are fed to a transformer handler configured for
 * indented {@code text} output that writes into an in-memory buffer. A key ending in
 * {@code tika.exception.testing.pdf} (case-insensitive) triggers a synthetic
 * {@code TikaException} instead of parsing.
 *
 * @param bucket     passed through to the result assembly
 * @param key        object key; also checked for the synthetic-failure suffix
 * @param objectData content to parse
 * @return the extraction result, or the exception result when Tika failed
 * @throws IOException                       propagated from the parser
 * @throws TransformerConfigurationException if the transformer handler cannot be created
 * @throws SAXException                      propagated from the parser
 */
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
  _logger.log("Extracting text with Tika");

  SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
  TransformerHandler handler = factory.newTransformerHandler();
  handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
  handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
  StringWriter sw = new StringWriter();
  handler.setResult(new StreamResult(sw));
  AutoDetectParser parser = new AutoDetectParser();
  ParseContext parseContext = new ParseContext();
  // Register the parser in the context under Parser.class.
  parseContext.set(Parser.class, parser);

  Metadata tikaMetadata = new Metadata();
  String extractedText;
  try {
    // for synthetic transactions; Locale.ROOT keeps the match independent of the
    // JVM's default locale (e.g. Turkish dotless-i lowercasing)
    if (key.toLowerCase(java.util.Locale.ROOT).endsWith("tika.exception.testing.pdf")) {
      throw new TikaException("Test Tika Exception");
    }
    parser.parse(objectData, handler, tikaMetadata, parseContext);
    extractedText = sw.toString();
  } catch (TikaException e) {
    _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
    return assembleExceptionResult(bucket, key, e);
  }
  _logger.log("Tika parsing success");
  return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}
 
Example #13
Source File: FileRequest.java    From Bastion with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Asks Tika for the MIME type of the given resource string and sets it as the
 * content type of the underlying request. A warning is logged when the result is
 * {@code application/octet-stream}.
 *
 * @param resource the resource string handed to {@code Tika.detect(String)}
 */
private void guessResourceMimeType(String resource) {
    final String detected = new Tika().detect(resource);
    final boolean fellBackToBinary = detected.equals("application/octet-stream");

    if (fellBackToBinary) {
        LOG.warning("Bastion might not have been able to determine the MIME type and is using" +
                " [application/octet-stream] for this request. Use setContentType() to change the MIME type.");
    }

    generalRequest.setContentType(ContentType.create(detected));
}
 
Example #14
Source File: SecureFileController.java    From cerberus with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the controller from its injected collaborators and a new Tika facade.
 *
 * @param secureDataService        stored in {@code secureDataService}
 * @param secureDataVersionService stored in {@code secureDataVersionService}
 * @param sdbAccessRequest         stored in {@code sdbAccessRequest}
 */
@Autowired
public SecureFileController(
    SecureDataService secureDataService,
    SecureDataVersionService secureDataVersionService,
    SdbAccessRequest sdbAccessRequest) {

  tika = new Tika();
  this.sdbAccessRequest = sdbAccessRequest;
  this.secureDataVersionService = secureDataVersionService;
  this.secureDataService = secureDataService;
}
 
Example #15
Source File: MyMimeTypeUtils.java    From spring-boot with Apache License 2.0 5 votes vote down vote up
/**
 * Uses Tika to determine the MIME type of the resource at the given URL.
 * Because Tika has to read the File / URL data stream, detection takes some time.
 * Do not rely on the file extension instead: it cannot adapt to the actual content
 * and is inaccurate.
 *
 * @param url     the URL whose content type is detected
 * @param timeout timeout passed to the availability check
 * @return the detected MIME type
 * @throws Exception if the URL is not available, or if detection fails
 */
public static String detect(URL url, int timeout) throws Exception {

    // The URL cannot be reached.
    final boolean reachable = MyUrlUtils.isURLAvailable(url, timeout);
    if (!reachable) {
        throw new Exception("exception ! " + url.getAuthority() + " not available");
    }
    return new Tika().detect(url);
}
 
Example #16
Source File: MyMimeTypeUtils.java    From spring-boot with Apache License 2.0 5 votes vote down vote up
/**
 * Uses Tika to determine the MIME type of the given file.
 * Because Tika has to read the File / URL data stream, detection takes some time.
 * Do not rely on the file extension instead: it cannot adapt to the actual content
 * and is inaccurate.
 *
 * @param file the file whose content type is detected
 * @return the detected MIME type
 * @throws Exception if the file does not exist, or if detection fails
 */
public static String detect(File file) throws Exception {

    // The file does not exist.
    final boolean present = file.exists();
    if (!present) {
        throw new Exception("exception ! " + file.getAbsoluteFile() + " not existes.");
    }
    return new Tika().detect(file);

}
 
Example #17
Source File: TestUtil.java    From gplaymusic with MIT License 5 votes vote down vote up
/**
 * Downloads the track at low quality into the system temp directory and asserts
 * that the resulting file exists and is detected by Tika as {@code audio/mpeg}.
 *
 * @param fileName name of the file to create under {@code java.io.tmpdir}
 * @param track    the track to download
 * @throws IOException if the download or the type detection fails
 */
public static void testDownload(String fileName, Track track) throws IOException {
  Path path = FileSystems.getDefault().getPath(System.getProperty("java.io.tmpdir"), fileName);
  track.download(StreamQuality.LOW, path);
  File file = path.toFile();
  Assert.assertTrue("File does not exist", file.exists());
  // JUnit's order is (message, expected, actual); passing them the other way round
  // produces a misleading "expected ... but was ..." failure message.
  Assert.assertEquals("Is not an audio file", "audio/mpeg", new Tika().detect(file));
}
 
Example #18
Source File: ResourceServiceImplTest.java    From jwala with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the {@code ResourceService} under test: a real
 * {@code ResourceContentGeneratorServiceImpl} and a real Tika facade, with the
 * remaining collaborators supplied as mocks.
 *
 * @return the configured {@code ResourceServiceImpl}
 */
@Bean
public ResourceService getResourceService() {
    final BinaryDistributionService binaryDistributionMock = mock(BinaryDistributionService.class);
    final ResourceContentGeneratorService contentGenerator = new ResourceContentGeneratorServiceImpl(
            mockGroupPesistenceService,
            mockWebServerPersistenceService,
            mockJvmPersistenceService,
            mockAppPersistenceService,
            mockHistoryFacadeService);

    return new ResourceServiceImpl(
            mockResourcePersistenceService,
            mockGroupPesistenceService,
            mockAppPersistenceService,
            mockJvmPersistenceService,
            mockWebServerPersistenceService,
            mockResourceDao,
            mockResourceHandler,
            contentGenerator,
            binaryDistributionMock,
            new Tika(),
            mockRepositoryService);
}
 
Example #19
Source File: AemServiceConfiguration.java    From jwala with Apache License 2.0 5 votes vote down vote up
/**
 * Declares the {@code resourceService} bean. The resource and group persistence
 * services come from {@code aemPersistenceServiceConfiguration}; the binary
 * distribution service comes from this configuration; a new Tika facade is created
 * for file type detection; everything else is injected as a parameter.
 *
 * @return a {@code ResourceServiceImpl} wired with the collaborators above
 */
@Bean(name = "resourceService")
public ResourceService getResourceService(final ApplicationPersistenceService applicationPersistenceService,
                                          final JvmPersistenceService jvmPersistenceService,
                                          final WebServerPersistenceService webServerPersistenceService,
                                          final ResourceDao resourceDao,
                                          final WebServerResourceHandler webServerResourceHandler,
                                          final ResourceContentGeneratorService resourceContentGeneratorService,
                                          @Qualifier("resourceRepositoryService")
                                          final RepositoryService repositoryService) {
    final Tika fileTypeDetector = new Tika();
    return new ResourceServiceImpl(
            aemPersistenceServiceConfiguration.getResourcePersistenceService(),
            aemPersistenceServiceConfiguration.getGroupPersistenceService(),
            applicationPersistenceService,
            jvmPersistenceService,
            webServerPersistenceService,
            resourceDao,
            webServerResourceHandler,
            resourceContentGeneratorService,
            binaryDistributionService,
            fileTypeDetector,
            repositoryService);
}
 
Example #20
Source File: TikaParserTest.java    From JQF with BSD 2-Clause "Simplified" License 5 votes vote down vote up
/**
 * Fuzz entry point: parses the generated stream with Tika and reads the resulting
 * text to the end, discarding it.
 *
 * @param in generated input
 * @throws IOException if parsing or reading fails
 */
@Fuzz
public void fuzz(@From(InputStreamGenerator.class) InputStream in) throws IOException {
    final Tika facade = new Tika();
    final char[] scratch = new char[1024];
    try (Reader parsed = facade.parse(in)) {
        int count;
        do {
            // Content is discarded; only reaching EOF matters.
            count = parsed.read(scratch);
        } while (count != -1);
    }

}
 
Example #21
Source File: Reference.java    From oodt with Apache License 2.0 5 votes vote down vote up
/**
 * <p>
 * Constructs a new Reference with the specified parameters. The MIME type is
 * derived by letting Tika detect a type name from {@code origRef} and looking that
 * name up in the MIME type repository. If the lookup fails, the failure is logged
 * and this constructor leaves the MIME type unassigned.
 * </p>
 * 
 * @param origRef
 *            The item's original location.
 * @param dataRef
 *            The item's location within the data store.
 * @param size
 *            The size of the file that this reference refers to.
 */
public Reference(String origRef, String dataRef, long size) {
    origReference = origRef;
    dataStoreReference = dataRef;
    fileSize = size;
    // TODO: since no mimetype was specified, do the dirty work
    // ourselves to determine the which MimeType class to associate
    // with this reference.
    try {
        this.mimeType = mimeTypeRepository.forName(new Tika().detect(origRef));
    } catch (MimeTypeException e) {
        // Pass the exception itself so the stack trace is not lost.
        LOG.log(Level.SEVERE, e.getMessage(), e);
    }

}
 
Example #22
Source File: MimeTypeUtils.java    From oodt with Apache License 2.0 5 votes vote down vote up
/**
 * Loads a MIME type registry from the given stream and builds a Tika facade whose
 * detector is backed by that registry. Any exception raised while doing so is
 * logged at SEVERE level and not rethrown.
 *
 * @param mimeIs stream the MIME type registry is created from
 * @param magic  value stored in {@code mimeMagic}
 */
public MimeTypeUtils(InputStream mimeIs, boolean magic) {
	try {
		mimeTypes = MimeTypesFactory.create(mimeIs);
		mimeMagic = magic;
		tika = new Tika(new DefaultDetector(mimeTypes));
	} catch (Exception e) {
		LOG.log(Level.SEVERE, "Failed to load MimeType Registry : " + e.getMessage(), e);
	}
}
 
Example #23
Source File: TikaAnalysis.java    From tutorials with MIT License 5 votes vote down vote up
/**
 * Extracts metadata from the given stream using the Tika facade.
 *
 * @param stream the document content
 * @return the metadata populated by {@code Tika.parse}
 * @throws IOException   if the stream cannot be read
 * @throws TikaException declared for callers; not thrown by the calls made here
 */
public static Metadata extractMetadatatUsingFacade(InputStream stream) throws IOException, TikaException {
    Tika tika = new Tika();
    Metadata metadata = new Metadata();

    // Tika.parse hands back a Reader over the extracted text. Only the metadata is
    // wanted here, but the reader still has to be closed so the resources behind it
    // are released instead of leaking.
    try (java.io.Reader ignored = tika.parse(stream, metadata)) {
        return metadata;
    }
}
 
Example #24
Source File: MediaTypeValidator.java    From iaf with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a validator that remembers the given PDF output location and owns a
 * single Tika instance.
 * NOTE(review): the previous comment described this constructor as having package
 * default access, but it is declared {@code public} - confirm the intended visibility.
 *
 * @param pdfOutputlocation stored in {@code pdfOutputlocation}
 */
public MediaTypeValidator(String pdfOutputlocation) {
	this.pdfOutputlocation = pdfOutputlocation;
	// One instance is created up front and reused; Tika appears to be thread safe, see
	// http://stackoverflow.com/questions/10190980/spring-tika-integration-is-my-approach-thread-safe
	tika = new Tika();
}
 
Example #25
Source File: MimeTypeUnitTest.java    From tutorials with MIT License 5 votes vote down vote up
/**
 * Test method demonstrating usage of Apache Tika: the MIME type detected for the
 * file at {@code FILE_LOC} must equal {@code PNG_EXT}.
 * 
 * @throws IOException if Tika cannot read the file
 */
@Test
public void whenUsingTika_thenSuccess() throws IOException {
    final File file = new File(FILE_LOC);
    final Tika tika = new Tika();
    final String mimeType = tika.detect(file);
    // assertEquals takes (expected, actual); the expected constant goes first so a
    // failure message reports the values the right way round.
    assertEquals(PNG_EXT, mimeType);
}
 
Example #26
Source File: ParserBolt.java    From storm-crawler with Apache License 2.0 4 votes vote down vote up
/**
 * Initialises the bolt from the topology configuration.
 *
 * <p>Reads {@code parser.emitOutlinks} (default {@code true}),
 * {@code parser.uppercase.element.names} (default {@code true}),
 * {@code parser.extract.embedded} (default {@code false}),
 * {@code parser.htmlmapper.classname} (default
 * {@code org.apache.tika.parser.html.IdentityHtmlMapper}),
 * {@code parser.mimetype.whitelist} and the protocol metadata prefix; builds the URL
 * and parse filters; instantiates Tika; registers a {@code MultiCountMetric} named
 * after this class; and obtains the {@code MetadataTransfer} instance.
 *
 * @param conf      topology configuration
 * @param context   used to register the event counter metric
 * @param collector stored for later use
 * @throws RuntimeException if the HTML mapper class cannot be loaded or does not
 *                          implement {@code HtmlMapper}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
public void prepare(Map conf, TopologyContext context,
        OutputCollector collector) {

    emitOutlinks = ConfUtils.getBoolean(conf, "parser.emitOutlinks", true);

    urlFilters = URLFilters.fromConf(conf);

    parseFilters = ParseFilters.fromConf(conf);

    upperCaseElementNames = ConfUtils.getBoolean(conf,
            "parser.uppercase.element.names", true);

    extractEmbedded = ConfUtils.getBoolean(conf, "parser.extract.embedded",
            false);

    String htmlmapperClassName = ConfUtils.getString(conf,
            "parser.htmlmapper.classname",
            "org.apache.tika.parser.html.IdentityHtmlMapper");

    try {
        HTMLMapperClass = Class.forName(htmlmapperClassName);
        boolean interfaceOK = HtmlMapper.class
                .isAssignableFrom(HTMLMapperClass);
        if (!interfaceOK) {
            throw new RuntimeException("Class " + htmlmapperClassName
                    + " does not implement HtmlMapper");
        }
    } catch (ClassNotFoundException e) {
        LOG.error("Can't load class {}", htmlmapperClassName);
        // Keep the original exception as the cause so the class-loading failure
        // stays visible in the stack trace.
        throw new RuntimeException("Can't load class "
                + htmlmapperClassName, e);
    }

    mimeTypeWhiteList = ConfUtils.loadListFromConf(
            "parser.mimetype.whitelist", conf);

    protocolMDprefix = ConfUtils.getString(conf,
            ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, "");

    // instantiate Tika, timing how long the construction takes
    long start = System.currentTimeMillis();
    tika = new Tika();
    long end = System.currentTimeMillis();

    LOG.debug("Tika loaded in {} msec", end - start);

    this.collector = collector;

    this.eventCounter = context.registerMetric(this.getClass()
            .getSimpleName(), new MultiCountMetric(), 10);

    this.metadataTransfer = MetadataTransfer.getInstance(conf);
}
 
Example #27
Source File: ContentExtractor.java    From jate with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Creates an extractor with a new Tika facade and a new plain-text parser.
 * An {@code AutoDetectParser} is intentionally not created here.
 */
public ContentExtractor() {
	tika = new Tika();
	txtParser = new TXTParser();
}
 
Example #28
Source File: TikaAnalysis.java    From tutorials with MIT License 4 votes vote down vote up
/**
 * Extracts the text content of the given stream using a new Tika facade.
 *
 * @param stream the document content
 * @return the string returned by {@code Tika.parseToString}
 * @throws IOException   if the stream cannot be read
 * @throws TikaException if the document cannot be parsed
 */
public static String extractContentUsingFacade(InputStream stream) throws IOException, TikaException {
    return new Tika().parseToString(stream);
}
 
Example #29
Source File: TikaAnalysis.java    From tutorials with MIT License 4 votes vote down vote up
/**
 * Detects the media type of the given stream using a new Tika facade.
 *
 * @param stream the document content
 * @return the media type string returned by {@code Tika.detect}
 * @throws IOException if the stream cannot be read
 */
public static String detectDocTypeUsingFacade(InputStream stream) throws IOException {
    return new Tika().detect(stream);
}
 
Example #30
Source File: OMailAttachment.java    From Orienteer with Apache License 2.0 4 votes vote down vote up
/**
 * Wraps this attachment's data in a {@code ByteArrayDataSource} whose content type
 * is detected from the bytes by Tika and whose name is this attachment's name.
 *
 * @return the named data source
 */
public DataSource toDataSource() {
    final byte[] payload = getData();
    final String contentType = new Tika().detect(payload);
    final ByteArrayDataSource source = new ByteArrayDataSource(payload, contentType);
    source.setName(getName());
    return source;
}